library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
This document is currently split between _v003 and _v003_a and _v003_b and _v003_c due to the need to keep the number of DLL that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:
Chapter 1 - Overview and Introduction
What is a hierarchical model?
Parts of a regression:
Random effects in regression:
School data:
Example code includes:
rawStudent <- read.csv("./RInputFiles/classroom.csv")
studentData <- rawStudent %>%
mutate(sex=factor(sex, labels=c("male", "female")), minority=factor(minority, labels=c("no", "yes")))
# Plot the data
ggplot(data = studentData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Fit a linear model
summary( lm(mathgain ~ housepov , data = studentData))
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
# I have aggregated the data for you into two new datasets at the classroom- and school-levels (As a side note, if you want to learn how to aggregate data, the dplyr or data.table courses teach these skills)
# We will also compare the model outputs across all three datasets
# Note: how we aggregate the data is important
# I aggregated the data by taking the mean across the student data (in pseudo-code: mean(mathgain) by school or mean(mathgain) by classroom),
# but another reasonable method for aggregating the data would be to aggregate by classroom first and school second
classData <- studentData %>%
group_by(schoolid, classid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(classData)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 312 obs. of 6 variables:
## $ schoolid: int 1 1 2 2 2 3 3 3 3 4 ...
## $ classid : int 160 217 197 211 307 11 137 145 228 48 ...
## $ mathgain: num 65.7 57.4 49.5 69 68.8 ...
## $ mathprep: num 2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
## $ housepov: num 0.082 0.082 0.082 0.082 0.082 0.086 0.086 0.086 0.086 0.365 ...
## $ yearstea: num 1 2 1 2 12.5 ...
## - attr(*, "vars")= chr "schoolid"
## - attr(*, "drop")= logi TRUE
schoolData <- studentData %>%
group_by(schoolid) %>%
summarize_at(vars(mathgain, mathprep, housepov, yearstea), mean, na.rm=TRUE)
str(schoolData)
## Classes 'tbl_df', 'tbl' and 'data.frame': 107 obs. of 5 variables:
## $ schoolid: int 1 2 3 4 5 6 7 8 9 10 ...
## $ mathgain: num 59.6 65 88.9 35.2 60.2 ...
## $ mathprep: num 2.91 2.35 2.95 2 3.75 ...
## $ housepov: num 0.082 0.082 0.086 0.365 0.511 0.044 0.148 0.085 0.537 0.346 ...
## $ yearstea: num 1.73 6.02 14.93 22 3 ...
# First, plot the housepov and mathgain at the classroom-level from the classData data.frame
ggplot(data = classData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Second, plot the housepov and mathgain at the school-level from the schoolData data.frame
ggplot(data = schoolData, aes(x = housepov, y = mathgain)) +
geom_point() +
geom_smooth(method = 'lm')
# Third, compare your linear regression results from the previous exercise to the two new models
summary( lm(mathgain ~ housepov, data = studentData)) ## student-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = studentData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.226 -22.222 -1.306 19.763 195.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.937 1.674 34.02 <2e-16 ***
## housepov 3.531 7.515 0.47 0.639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.63 on 1188 degrees of freedom
## Multiple R-squared: 0.0001858, Adjusted R-squared: -0.0006558
## F-statistic: 0.2208 on 1 and 1188 DF, p-value: 0.6385
summary( lm(mathgain ~ housepov, data = classData)) ## class-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = classData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.479 -14.444 -1.447 13.151 156.516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.160 2.542 22.879 <2e-16 ***
## housepov -3.236 10.835 -0.299 0.765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.14 on 310 degrees of freedom
## Multiple R-squared: 0.0002876, Adjusted R-squared: -0.002937
## F-statistic: 0.08918 on 1 and 310 DF, p-value: 0.7654
summary( lm(mathgain ~ housepov, data = schoolData)) ## school-level data
##
## Call:
## lm(formula = mathgain ~ housepov, data = schoolData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.660 -9.947 -2.494 9.546 41.445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.338 2.624 22.616 <2e-16 ***
## housepov -11.948 10.987 -1.087 0.279
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.8 on 105 degrees of freedom
## Multiple R-squared: 0.01114, Adjusted R-squared: 0.00172
## F-statistic: 1.183 on 1 and 105 DF, p-value: 0.2793
# Plot the means of your data, predictor is your x-variable, response is your y-variable, and intDemo is your data.frame
intDemo <- data.frame(predictor=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
response=c(-1.207, 0.277, 1.084, -2.346, 0.429, 5.759, 4.138, 4.18, 4.153, 3.665, 9.046, 8.003, 8.447, 10.129, 11.919)
)
str(intDemo)
## 'data.frame': 15 obs. of 2 variables:
## $ predictor: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 2 2 2 2 2 ...
## $ response : num -1.207 0.277 1.084 -2.346 0.429 ...
ggIntDemo <- ggplot(intDemo, aes(x = predictor, y = response) ) +
geom_point() +
theme_minimal() + stat_summary(fun.y = "mean", color = "red",
size = 3, geom = "point") +
xlab("Intercept groups")
print(ggIntDemo)
# Fit a linear model to your data where response is "predicted by"(~) predictor
intModel <- lm( response ~ predictor - 1 , data = intDemo)
summary(intModel)
##
## Call:
## lm(formula = response ~ predictor - 1, data = intDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9934 -0.7842 -0.2260 0.7056 2.4102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## predictora -0.3526 0.5794 -0.609 0.554
## predictorb 4.3790 0.5794 7.558 6.69e-06 ***
## predictorc 9.5088 0.5794 16.412 1.38e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.296 on 12 degrees of freedom
## Multiple R-squared: 0.9646, Adjusted R-squared: 0.9557
## F-statistic: 109 on 3 and 12 DF, p-value: 5.696e-09
# Overlay the model's estimated group means (large translucent blue points)
# on the raw-data plot. Relies on the file-level global `ggIntDemo` (the base
# ggplot of the raw data) and on broom for coefficient extraction.
extractAndPlotResults <- function(intModel){
  coefTable <- broom::tidy(intModel)
  # Strip the "predictor" prefix so each term matches a factor level on the x-axis
  coefTable$term <- factor(gsub("predictor", "", coefTable$term))
  combinedPlot <- ggIntDemo +
    geom_point(data = coefTable,
               aes(x = term, y = estimate),
               position = position_dodge(width = 0.4),
               color = 'blue', size = 8, alpha = 0.25)
  print(combinedPlot)
}
# Run the next code that extracts out the model's coefficients and plots them
extractAndPlotResults(intModel)
multIntDemo <- data.frame(group=factor(c(rep("a", 5), rep("b", 5), rep("c", 5))),
x=rep(0:4, times=3),
intercept=c(4.11, -1.69, 1.09, 1.9, 1.21, 4.63, 10.29, 4.67, 12.06, 4.78, 15.22, 19.15, 4.44, 8.88, 9.47),
response=c(4.11, 2.31, 9.09, 13.9, 17.21, 4.63, 14.29, 12.67, 24.06, 20.78, 15.22, 23.15, 12.44, 20.88, 25.47)
)
str(multIntDemo)
## 'data.frame': 15 obs. of 4 variables:
## $ group : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 2 2 2 2 2 ...
## $ x : int 0 1 2 3 4 0 1 2 3 4 ...
## $ intercept: num 4.11 -1.69 1.09 1.9 1.21 ...
## $ response : num 4.11 2.31 9.09 13.9 17.21 ...
# Scatter response vs x colored by group, with a single pooled OLS fit
# (thick orange line) ignoring group structure.
# NOTE(review): the `out1` argument is never used; the plot is rebuilt from
# the file-level global `multIntDemo` via stat_smooth.
plot_output1 <- function(out1){
  pooledFitPlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', fill = NA, color = 'orange', size = 3)
  print(pooledFitPlot)
}
# Plot the shared-slope model (response ~ x + group - 1): one straight line
# per group, all with the same slope but different intercepts, over the raw
# points from the file-level global `multIntDemo`.
plot_output2 <- function(out2){
# Tidy the fitted model into a table of terms and estimates
out2Tidy <- broom::tidy(out2)
# Strip the "group" prefix so terms match multIntDemo's factor levels
out2Tidy$term <- gsub("group", "", out2Tidy$term)
# Positional indexing assumes row 1 is the shared slope for x and the
# remaining rows are the per-group intercepts, with term in column 1 and
# estimate in column 2 -- TODO confirm this ordering holds for the broom
# version in use (newer broom returns a tibble, where single-column
# subsetting behaves differently from a data.frame)
out2Plot <- data.frame(group = out2Tidy[ -1, 1],
slope = out2Tidy[ 1, 2],
intercept = out2Tidy[ -1, 2])
ggmultIntgDemo2 <- ggplot( multIntDemo, aes(x = x, y = response) ) +
geom_point(aes(color = group))+
theme_minimal() +
scale_color_manual(values = c("blue", "black", "red")) +
# Draw one line per group from the extracted slope/intercept pairs
geom_abline(data = out2Plot,
aes(intercept = intercept, slope = slope, color = group))
print(ggmultIntgDemo2)
}
# Scatter response vs x colored by group, with a separate lm smoother fitted
# per group (interaction-style visualization).
# NOTE(review): the `out3` argument is never used; the per-group fits are
# recomputed by stat_smooth from the file-level global `multIntDemo`.
plot_output3 <- function(out3){
  perGroupFitPlot <- ggplot(multIntDemo, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA)
  print(perGroupFitPlot)
}
# First, run a model without considering different intercept for each group
out1 <- lm( response ~ x, data=multIntDemo )
summary(out1)
##
## Call:
## lm(formula = response ~ x, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.101 -4.021 -2.011 3.590 11.739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.141 2.615 3.113 0.00824 **
## x 3.270 1.068 3.062 0.00908 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.848 on 13 degrees of freedom
## Multiple R-squared: 0.4191, Adjusted R-squared: 0.3744
## F-statistic: 9.378 on 1 and 13 DF, p-value: 0.009081
plot_output1(out1)
# Considering same slope but different intercepts
out2 <- lm( response ~ x + group - 1, data=multIntDemo )
summary(out2)
##
## Call:
## lm(formula = response ~ x + group - 1, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.219 -0.234 1.810 6.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.2697 0.7516 4.350 0.001155 **
## groupa 2.7847 2.3767 1.172 0.266085
## groupb 8.7467 2.3767 3.680 0.003625 **
## groupc 12.8927 2.3767 5.425 0.000209 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.117 on 11 degrees of freedom
## Multiple R-squared: 0.9534, Adjusted R-squared: 0.9364
## F-statistic: 56.23 on 4 and 11 DF, p-value: 2.97e-07
plot_output2(out2)
# Considering different slope and intercept for each group (i.e., an interaction)
out3 <- lm( response ~ x + group - 1 + x:group, multIntDemo)
summary(out3)
##
## Call:
## lm(formula = response ~ x + group - 1 + x:group, data = multIntDemo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.992 -2.429 -0.234 2.368 5.541
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 3.779 1.308 2.888 0.017941 *
## groupa 1.766 3.205 0.551 0.595053
## groupb 6.872 3.205 2.144 0.060621 .
## groupc 15.786 3.205 4.925 0.000819 ***
## x:groupb 0.428 1.851 0.231 0.822263
## x:groupc -1.956 1.851 -1.057 0.318050
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.138 on 9 degrees of freedom
## Multiple R-squared: 0.9615, Adjusted R-squared: 0.9358
## F-statistic: 37.42 on 6 and 9 DF, p-value: 7.263e-06
plot_output3(out3)
multIntDemo$intercept <- c(-0.87, 3.35, 1.25, 0.88, -1.05, 4.55, 1.22, 3.34, 1.26, 3.75, 7.71, 9.59, 2.28, 1.9, 13.35)
multIntDemo$response <- c(-0.87, 6.35, 7.25, 9.88, 10.95, 4.55, 4.22, 9.34, 10.26, 15.75, 7.71, 12.59, 8.28, 10.9, 25.35)
# Run model
outLmer <- lme4::lmer( response ~ x + ( 1 | group), multIntDemo)
# Look at model outputs
summary( outLmer )
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ x + (1 | group)
## Data: multIntDemo
##
## REML criterion at convergence: 76.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.31582 -0.61105 -0.01593 0.45125 2.19118
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 7.98 2.825
## Residual 10.71 3.272
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 3.5540 2.1913 1.622
## x 2.9733 0.5975 4.977
##
## Correlation of Fixed Effects:
## (Intr)
## x -0.545
broom::tidy( outLmer )
## term estimate std.error statistic group
## 1 (Intercept) 3.554000 2.1912887 1.621877 fixed
## 2 x 2.973333 0.5974739 4.976507 fixed
## 3 sd_(Intercept).group 2.824834 NA NA group
## 4 sd_Observation.Residual 3.272500 NA NA Residual
# Overlay the lmer model's predictions (dashed lines) on reference lines
# built from the `intercept` column of the file-level global `multIntDemo`.
# `slope` is a single scalar applied to every group's reference line.
extractAndPlotOutput <- function(outLmer, slope=3){
# Assignment creates a function-local copy of multIntDemo; the global is untouched
multIntDemo$lmerPredict <- predict(outLmer)
ggmultIntgDemo2 <- ggplot( multIntDemo, aes(x = x, y = response) ) +
geom_point(aes(color = group))+
theme_minimal() +
scale_color_manual(values = c("blue", "black", "red")) +
# NOTE(review): `intercept` resolves to a column of multIntDemo, while
# `slope` falls through to the function argument because the data has no
# column of that name -- a shadowing hazard if such a column is ever added
geom_abline(data = multIntDemo,
aes(intercept = intercept, slope = slope, color = group))
outPlot <- ggmultIntgDemo2 +
# Dashed lines trace the model's fitted values per group
geom_line( data = multIntDemo,
aes(x = x, y = lmerPredict, color = group),
linetype = 2)
print(outPlot)
}
# Extract predictor variables and plot
extractAndPlotOutput(outLmer)
# Random effect slopes
multIntDemo$response <- c(-0.72, 1.5, 4.81, 6.61, 13.62, 10.21, 9.64, 11.91, 16.39, 16.97, 8.76, 14.79, 15.83, 15.27, 17.36)
multIntDemo$intercept <- c(-0.72, -1.5, -1.19, -2.39, 1.62, 10.21, 6.64, 5.91, 7.39, 4.97, 8.76, 11.79, 9.83, 6.27, 5.36)
outLmer2 <- lme4::lmer( response ~ ( x|group ), multIntDemo)
summary(outLmer2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: response ~ (x | group)
## Data: multIntDemo
##
## REML criterion at convergence: 69.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.56747 -0.54105 -0.06286 0.75141 1.27947
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## group (Intercept) 273.998 16.553
## x 6.096 2.469 -1.00
## Residual 2.466 1.570
## Number of obs: 15, groups: group, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 21.676 1.383 15.67
broom::tidy(outLmer2)
## term estimate std.error statistic group
## 1 (Intercept) 21.675699 1.383061 15.67226 fixed
## 2 sd_(Intercept).group 16.552871 NA NA group
## 3 sd_x.group 2.468959 NA NA group
## 4 cor_(Intercept).x.group -1.000000 NA NA group
## 5 sd_Observation.Residual 1.570349 NA NA Residual
# Compare per-group lm smoothers (solid) against the lmer model's predictions
# (dashed) for the data in the file-level global `multIntDemo`.
plotOutput <- function(outLmer2){
  # Work on a local copy; the global multIntDemo is never modified
  plotData <- multIntDemo
  plotData$lmerPredict2 <- predict(outLmer2)
  basePlot <- ggplot(plotData, aes(x = x, y = response)) +
    geom_point(aes(color = group)) +
    theme_minimal() +
    scale_color_manual(values = c("blue", "black", "red")) +
    stat_smooth(method = 'lm', aes(color = group), fill = NA)
  finalPlot <- basePlot +
    geom_line(data = plotData,
              aes(x = x, y = lmerPredict2, color = group),
              linetype = 2)
  print(finalPlot)
}
# Extract and plot
plotOutput(outLmer2)
# Mixed effect model
lmerModel <- lme4::lmer(mathgain ~ sex +
mathprep + mathknow + (1|classid) +
(1|schoolid), data = studentData, na.action = "na.omit",
REML = TRUE)
summary(lmerModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## mathgain ~ sex + mathprep + mathknow + (1 | classid) + (1 | schoolid)
## Data: studentData
##
## REML criterion at convergence: 10677.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3203 -0.6146 -0.0294 0.5467 5.5331
##
## Random effects:
## Groups Name Variance Std.Dev.
## classid (Intercept) 103.57 10.177
## schoolid (Intercept) 85.44 9.244
## Residual 1019.47 31.929
## Number of obs: 1081, groups: classid, 285; schoolid, 105
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 52.250 3.838 13.613
## sexfemale -1.526 2.041 -0.747
## mathprep 2.426 1.298 1.869
## mathknow 2.405 1.299 1.851
##
## Correlation of Fixed Effects:
## (Intr) sexfml mthprp
## sexfemale -0.268
## mathprep -0.878 0.001
## mathknow -0.003 0.011 0.005
# Coefficient plot for an lmer model: point estimates with 95% confidence
# intervals for every fixed effect except the intercept, drawn horizontally
# (coord_flip) with a red reference line at zero.
extractAndPlot <- function(lmerModel){
# Tidy with confidence intervals; conf.int = TRUE adds conf.low/conf.high
modelOutPlot <- broom::tidy(lmerModel, conf.int = TRUE)
# Keep only fixed effects, dropping the intercept. NOTE(review): this
# relies on tidy() returning a "group" column with value "fixed" for
# fixed effects -- newer broom.mixed versions name this column "effect";
# TODO confirm against the installed broom version
modelOutPlot <- modelOutPlot[ modelOutPlot$group =="fixed" &
modelOutPlot$term != "(Intercept)", ]
plotOut <- ggplot(modelOutPlot, aes(x = term, y = estimate,
ymin = conf.low,
ymax = conf.high)) +
theme_minimal() +
# Red line at zero: effects whose interval crosses it are not significant
geom_hline(yintercept = 0.0, color = 'red', size = 2.0) +
geom_point() +
geom_linerange() + coord_flip()
print(plotOut)
}
# Extract and plot
extractAndPlot(lmerModel)
Chapter 2 - Linear Mixed-Effect Models
Linear mixed effect model - Birth rates data:
Understanding and reporting the outputs of lmer:
Statistical inference with Maryland crime data:
Example code includes:
# Read in births data
rawBirths <- read.csv("./RInputFiles/countyBirthsDataUse.csv")
countyBirthsData <- rawBirths
str(countyBirthsData)
## 'data.frame': 580 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ TotalPopulation : int 203709 115620 103057 104173 660367 156993 353089 415395 226519 119565 ...
## $ BirthRate : num 11.5 12.1 11.8 12.4 13.3 ...
## $ AverageBirthWeight: num 3261 3209 3239 3207 3177 ...
## $ AverageAgeofMother: num 27.5 26.3 25.8 26.9 27.9 ...
## $ CountyName : Factor w/ 472 levels "Ada","Adams",..: 22 64 141 189 200 229 248 273 278 279 ...
## $ State : Factor w/ 50 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
# First, build a lmer with state as a random effect. Then look at the model's summary and the plot of residuals.
birthRateStateModel <- lme4::lmer(BirthRate ~ (1|State), data=countyBirthsData)
summary(birthRateStateModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2411
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.7957 -0.6056 -0.1063 0.5211 5.5948
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.899 1.378
## Residual 3.256 1.804
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.3362 0.2216 55.67
plot(birthRateStateModel)
# Next, plot the predicted values from the model on top of the plot shown during the video.
countyBirthsData$birthPredictState <- predict(birthRateStateModel, countyBirthsData)
ggplot() + theme_minimal() +
geom_point(data =countyBirthsData, aes(x = TotalPopulation, y = BirthRate)) +
geom_point(data = countyBirthsData, aes(x = TotalPopulation, y = birthPredictState),
color = 'blue', alpha = 0.5
)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_point).
# Include the AverageAgeofMother as a fixed effect within the lmer and state as a random effect
ageMotherModel <- lme4::lmer( BirthRate ~ AverageAgeofMother + (1|State), data=countyBirthsData)
summary(ageMotherModel)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (1 | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57033 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
# Compare the random-effect model to the linear effect model
summary(lm(BirthRate ~ AverageAgeofMother, data = countyBirthsData))
##
## Call:
## lm(formula = BirthRate ~ AverageAgeofMother, data = countyBirthsData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8304 -1.3126 -0.1795 1.2198 8.7327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.06637 1.83374 15.851 <2e-16 ***
## AverageAgeofMother -0.59380 0.06441 -9.219 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.065 on 576 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1286, Adjusted R-squared: 0.1271
## F-statistic: 84.99 on 1 and 576 DF, p-value: < 2.2e-16
# Include the AverageAgeofMother as a correlated random-effect slope parameter
ageMotherModelRandomCorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother + (AverageAgeofMother|State),
countyBirthsData)
summary(ageMotherModelRandomCorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula: BirthRate ~ AverageAgeofMother + (AverageAgeofMother | State)
## Data: countyBirthsData
##
## REML criterion at convergence: 2337.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.8399 -0.5966 -0.1133 0.5228 5.1815
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## State (Intercept) 78.75816 8.8746
## AverageAgeofMother 0.08482 0.2912 -0.99
## Residual 2.80306 1.6742
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.22042 2.41281 11.282
## AverageAgeofMother -0.52347 0.08302 -6.306
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.997
# Include the AverageAgeofMother as an uncorrelated random-effect slope parameter
ageMotherModelRandomUncorrelated <- lme4::lmer(BirthRate ~ AverageAgeofMother +
(AverageAgeofMother || State), data=countyBirthsData
)
summary(ageMotherModelRandomUncorrelated)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## BirthRate ~ AverageAgeofMother + ((1 | State) + (0 + AverageAgeofMother |
## State))
## Data: countyBirthsData
##
## REML criterion at convergence: 2347.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.9602 -0.6086 -0.1042 0.5144 5.2686
##
## Random effects:
## Groups Name Variance Std.Dev.
## State (Intercept) 1.562 1.250
## State.1 AverageAgeofMother 0.000 0.000
## Residual 2.920 1.709
## Number of obs: 578, groups: State, 50
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 27.57033 1.81801 15.165
## AverageAgeofMother -0.53549 0.06349 -8.434
##
## Correlation of Fixed Effects:
## (Intr)
## AvrgAgfMthr -0.994
out <- ageMotherModelRandomUncorrelated
# Extract the fixed-effect coefficients
lme4::fixef(out)
## (Intercept) AverageAgeofMother
## 27.5703303 -0.5354886
# Extract the random-effect coefficients
lme4::ranef(out)
## $State
## (Intercept) AverageAgeofMother
## AK 1.03549149 0
## AL -0.52500819 0
## AR 0.48023356 0
## AZ -1.04094123 0
## CA 0.50530542 0
## CO 0.09585582 0
## CT -1.91638101 0
## DC 0.96029532 0
## DE -0.38938118 0
## FL -1.87440508 0
## GA 0.39776424 0
## HI 0.08513474 0
## IA 0.96279528 0
## ID 1.17377458 0
## IL -0.12739337 0
## IN -0.32655206 0
## KS 0.85650338 0
## KY 0.64871300 0
## LA 1.04437463 0
## MA -1.40082047 0
## MD 0.10842918 0
## ME -1.63520397 0
## MI -1.13797832 0
## MN 0.93266233 0
## MO 0.07081901 0
## MS -0.21397453 0
## MT -0.13190265 0
## NC -0.28681241 0
## ND 0.99847758 0
## NE 1.49390428 0
## NH -1.45440958 0
## NJ -0.30089452 0
## NM -0.69753301 0
## NV 0.09012925 0
## NY -0.58163335 0
## OH -1.07390325 0
## OK 0.77997159 0
## OR -0.75845586 0
## PA -1.59332743 0
## RI -1.36395356 0
## SC -0.59295090 0
## SD 1.35141914 0
## TN -0.13512968 0
## TX 1.70872465 0
## UT 3.66056804 0
## VA 1.59187553 0
## VT -0.51105276 0
## WA 0.23008359 0
## WI -0.51646717 0
## WV -0.67684007 0
# Estimate the confidence intervals
(ciOut <- confint(out))
## Computing profile confidence intervals ...
## 2.5 % 97.5 %
## .sig01 0.0000000 1.61243911
## .sig02 0.0000000 0.05033967
## .sigma 1.6091449 1.81592859
## (Intercept) 24.0121848 31.14668434
## AverageAgeofMother -0.6605319 -0.41123099
# Technical note: Extracting out the regression coefficients from lmer is tricky (see discussion between the lmer and broom authors development)
# Extract out the parameter estimates and confidence intervals and manipulate the data
dataPlot <- data.frame(cbind( lme4::fixef(out), ciOut[ 4:5, ]))
rownames(dataPlot)[1] <- "Intercept"
colnames(dataPlot) <- c("mean", "l95", "u95")
dataPlot$parameter <- rownames(dataPlot)
# Print the new dataframe
print(dataPlot)
## mean l95 u95 parameter
## Intercept 27.5703303 24.0121848 31.146684 Intercept
## AverageAgeofMother -0.5354886 -0.6605319 -0.411231 AverageAgeofMother
# Plot the results using ggplot2
ggplot(dataPlot, aes(x = parameter, y = mean,
ymin = l95, ymax = u95)) +
geom_hline( yintercept = 0, color = 'red' ) +
geom_linerange() + geom_point() + coord_flip() + theme_minimal()
# Read in crime data
rawCrime <- read.csv("./RInputFiles/MDCrime.csv")
MDCrime <- rawCrime
str(MDCrime)
## 'data.frame': 192 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ County: Factor w/ 24 levels "ALLEGANY","ANNE ARUNDEL",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Year : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Crime : int 3167 10871 5713 257 149 374 490 729 181 752 ...
## $ Year2 : int 0 0 0 0 0 0 0 0 0 0 ...
plot1 <- ggplot(data = MDCrime, aes(x = Year, y = Crime, group = County)) +
geom_line() + theme_minimal() +
ylab("Major crimes reported per county")
print(plot1)
plot1 + geom_smooth(method = 'lm')
# Null hypothesis testing uses p-values to see if a variable is "significant"
# Recently, the abuse and overuse of null hypothesis testing and p-values has caused the American Statistical Association to issue a statement about the use of p-values
# Because of these criticisms and other numerical challenges, Doug Bates (the creator of the lme4 package) does not include p-values as part of his package
# However, you may still want to estimate p-values, because p-values are still commonly used. Several packages exist, including the lmerTest package
# https://www.amstat.org/asa/files/pdfs/P-ValueStatement.pdf
# Load lmerTest
# library(lmerTest)
# Fit the model with Year as both a fixed and random-effect
lme4::lmer(Crime ~ Year + (1 + Year | County) , data = MDCrime)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.338309
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year + (1 + Year | County)
## Data: MDCrime
## REML criterion at convergence: 2891.03
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 655.2118
## Year 0.8322 1.00
## Residual 328.2865
## Number of obs: 192, groups: County, 24
## Fixed Effects:
## (Intercept) Year
## 136642.97 -67.33
## convergence code 0; 3 optimizer warnings; 0 lme4 warnings
# Fit the model with Year2 rather than Year
out <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
# Examine the model's output
summary(out)
## Linear mixed model fit by REML ['lmerMod']
## Formula: Crime ~ Year2 + (1 + Year2 | County)
## Data: MDCrime
##
## REML criterion at convergence: 2535.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8081 -0.2235 -0.0390 0.2837 3.0768
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 7587959 2754.63
## Year2 16945 130.17 -0.91
## Residual 8425 91.79
## Number of obs: 192, groups: County, 24
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 1577.28 562.42 2.804
## Year2 -67.33 26.73 -2.519
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.907
## Build the Null model with only County as a random-effect
null_model <- lme4::lmer(Crime ~ (1 | County) , data = MDCrime)
## Build the Year2 model with Year2 as a fixed and random slope and County as the random-effect
year_model <- lme4::lmer(Crime ~ Year2 + (1 + Year2 | County) , data = MDCrime)
## Compare the two models using an anova
anova(null_model, year_model)
## refitting model(s) with ML (instead of REML)
## Data: MDCrime
## Models:
## null_model: Crime ~ (1 | County)
## year_model: Crime ~ Year2 + (1 + Year2 | County)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## null_model 3 2954.4 2964.2 -1474.2 2948.4
## year_model 6 2568.9 2588.4 -1278.4 2556.9 391.52 3 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Chapter 3 - Generalized Linear Mixed-Effect Models
Crash course on GLMs - relaxing the assumptions around normality of the residuals:
Binomial data - modeling data with only two outcomes:
Count data:
Example code includes:
# In this case study, we will be working with simulated dose-response data
# The response is mortality (1) or survival (0) at the end of a study. During this exercise, we will fit a logistic regression using all three methods described in the video
# You have been given two datasets. dfLong has the data in a "long" format with each row corresponding to an observation (i.e., a 0 or 1)
# dfShort has the data in an aggregated format with each row corresponding to a treatment (e.g., 6 successes, 4 failures, number of replicates = 10, proportion = 0.6)
dfLong <- data.frame(dose=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1)
)
str(dfLong)
## 'data.frame': 120 obs. of 2 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
dfShort <- dfLong %>%
group_by(dose) %>%
summarize(mortality=sum(mortality), nReps=n()) %>%
mutate(survival=nReps-mortality, mortalityP=mortality/nReps)
dfShort
## # A tibble: 6 x 5
## dose mortality nReps survival mortalityP
## <dbl> <dbl> <int> <dbl> <dbl>
## 1 0 0 20 20.0 0
## 2 2.00 4.00 20 16.0 0.200
## 3 4.00 8.00 20 12.0 0.400
## 4 6.00 10.0 20 10.0 0.500
## 5 8.00 11.0 20 9.00 0.550
## 6 10.0 13.0 20 7.00 0.650
# Fit a glm using data in a long format
fitLong <- glm(mortality ~ dose, data = dfLong, family = "binomial")
summary(fitLong)
##
## Call:
## glm(formula = mortality ~ dose, family = "binomial", data = dfLong)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5916 -0.8245 -0.4737 1.0440 1.8524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44532 -4.785 1.71e-06 ***
## dose 0.30663 0.06821 4.495 6.95e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 159.76 on 119 degrees of freedom
## Residual deviance: 134.71 on 118 degrees of freedom
## AIC: 138.71
##
## Number of Fisher Scoring iterations: 3
# Fit a glm using data in a short format with two columns
# cbind(successes, failures) on the left-hand side fits the same logistic
# model from aggregated counts. Note the coefficient estimates match fitLong,
# while deviance/AIC differ because the likelihood is over 6 binomial rows
# rather than 120 Bernoulli rows (compare the summaries).
fitShort <- glm( cbind(mortality , survival ) ~ dose , data = dfShort, family = "binomial")
summary(fitShort)
##
## Call:
## glm(formula = cbind(mortality, survival) ~ dose, family = "binomial",
## data = dfShort)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Fit a glm using data in a short format with weights
# Third equivalent formulation: model the observed proportion directly and
# supply the number of trials via `weights`. The output is identical to
# fitShort (same coefficients, deviances, and AIC).
fitShortP <- glm( mortalityP ~ dose , data = dfShort, weights = nReps , family = "binomial")
summary(fitShortP)
##
## Call:
## glm(formula = mortalityP ~ dose, family = "binomial", data = dfShort,
## weights = nReps)
##
## Deviance Residuals:
## 1 2 3 4 5 6
## -2.1186 0.2316 1.0698 0.6495 -0.2699 -0.6634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.13075 0.44537 -4.784 1.72e-06 ***
## dose 0.30663 0.06822 4.495 6.97e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 31.6755 on 5 degrees of freedom
## Residual deviance: 6.6214 on 4 degrees of freedom
## AIC: 27.415
##
## Number of Fisher Scoring iterations: 4
# Small count-data example: compare an ordinary linear model with a
# Poisson GLM on the same (y, x) pairs. The slope is not significant under
# lm (p = 0.0873) but is under the Poisson glm (p = 0.0495) — see outputs.
y <- c(0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 1, 2, 0, 1, 1, 0, 1, 5, 1, 1)
x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
# Fit the linear model
summary(lm(y ~ x))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3677 -0.6145 -0.2602 0.4297 3.4805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15263 0.50312 0.303 0.7651
## x 0.07594 0.04200 1.808 0.0873 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.083 on 18 degrees of freedom
## Multiple R-squared: 0.1537, Adjusted R-squared: 0.1067
## F-statistic: 3.269 on 1 and 18 DF, p-value: 0.08733
# Fit the generalized linear model
summary(glm(y ~ x, family = "poisson"))
##
## Call:
## glm(formula = y ~ x, family = "poisson")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6389 -0.9726 -0.3115 0.5307 2.1559
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.04267 0.60513 -1.723 0.0849 .
## x 0.08360 0.04256 1.964 0.0495 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 23.589 on 19 degrees of freedom
## Residual deviance: 19.462 on 18 degrees of freedom
## AIC: 52.17
##
## Number of Fisher Scoring iterations: 5
# Often, we want to "look" at our data and trends in our data
# ggplot2 allows us to add trend lines to our data
# The default lines are created using a technique called local regression
# However, we can specify different models, including GLMs
# During this exercise, we'll see how to plot a GLM
# Plot the data using jittered points and the default stat_smooth
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(fill = 'pink', color = 'red')
## `geom_smooth()` using method = 'loess'
# Plot the data using jittered points and the glm stat_smooth
# (method.args passes family = "binomial" through to glm(), so the
# fitted curve is a logistic regression rather than a loess smooth)
ggplot(data = dfLong, aes(x = dose, y = mortality)) +
geom_jitter(height = 0.05, width = 0.1) +
stat_smooth(method = 'glm', method.args = list(family = "binomial"))
# library(lmerTest)
# Three replicates (a, b, c) of the six-dose mortality experiment, stacked
# into one 360-row data frame so a random intercept per replicate can be fit.
df <- data.frame(dose=rep(rep(c(0, 2, 4, 6, 8, 10), each=20), times=3),
mortality=c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1),
replicate=factor(rep(letters[1:3], each=120))
)
str(df)
## 'data.frame': 360 obs. of 3 variables:
## $ dose : num 0 0 0 0 0 0 0 0 0 0 ...
## $ mortality: num 0 0 0 0 0 0 0 0 0 0 ...
## $ replicate: Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
# Mixed-effects logistic regression: fixed dose effect, random intercept per
# replicate. Note the random-effect variance is essentially zero (6.7e-15),
# i.e. the replicates show no detectable baseline differences.
glmerOut <- lme4::glmer(mortality ~ dose + (1|replicate), family = 'binomial', data = df)
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: mortality ~ dose + (1 | replicate)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 378.1 389.8 -186.0 372.1 357
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.3484 -0.6875 -0.3031 0.6413 2.1907
##
## Random effects:
## Groups Name Variance Std.Dev.
## replicate (Intercept) 6.658e-15 8.16e-08
## Number of obs: 360, groups: replicate, 3
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.38736 0.27334 -8.734 <2e-16 ***
## dose 0.40948 0.04414 9.276 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## dose -0.884
# library(lmerTest)
# Fit the model and look at its summary
# (example code from the course; allData is not loaded in this document)
# modelOut <- lme4::glmer( cbind(Purchases, Pass) ~ friend + ranking + (1|city), data = allData, family = 'binomial')
# summary( modelOut)
# Compare outputs to a lmer model
# summary(lme4::lmer( Purchases/( Purchases + Pass) ~ friend + ranking + (1|city), data = allData))
# Run the code to see how to calculate odds ratios
# summary(modelOut)
# exp(fixef(modelOut)[2])
# exp(confint(modelOut)[3, ])
# Load lmerTest
# library(lmerTest)
# Load lmerTest
# library(lmerTest)
userGroups <- data.frame(group=factor(rep(rep(LETTERS[1:4], each=10), times=2)),
webpage=factor(rep(c("old", "new"), each=40)),
clicks=c(0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 1, 2, 0, 1, 1, 0, 3, 2, 3, 1, 2, 4, 2, 1, 0, 2, 0, 1, 2, 0, 2, 1, 1, 2, 4, 2, 8, 1, 1, 1, 2, 1, 1, 0, 0, 3, 0, 1, 4, 1, 2, 0, 1, 1, 0, 0, 3, 2, 0, 3, 1, 2, 2, 0, 2, 3, 1, 3, 2, 4, 4, 2, 1, 5, 2)
)
str(userGroups)
## 'data.frame': 80 obs. of 3 variables:
## $ group : Factor w/ 4 levels "A","B","C","D": 1 1 1 1 1 1 1 1 1 1 ...
## $ webpage: Factor w/ 2 levels "new","old": 2 2 2 2 2 2 2 2 2 2 ...
## $ clicks : num 0 0 0 0 0 0 2 0 0 0 ...
# Fit a Poisson glmer
summary( lme4::glmer(clicks ~ webpage + (1|group), family = 'poisson', data = userGroups))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: clicks ~ webpage + (1 | group)
## Data: userGroups
##
## AIC BIC logLik deviance df.resid
## 255.5 262.6 -124.7 249.5 77
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.3999 -0.9104 -0.2340 0.4978 5.6126
##
## Random effects:
## Groups Name Variance Std.Dev.
## group (Intercept) 0.07093 0.2663
## Number of obs: 80, groups: group, 4
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5524 0.1797 3.074 0.00211 **
## webpageold -0.5155 0.1920 -2.685 0.00726 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## webpageold -0.400
# library(lmerTest)
# Illinois count data by age group, year, and county; Poisson glmer with a
# random slope and intercept for year within county.
rawIL <- read.csv("./RInputFiles/ILData.csv")
ILdata <- rawIL
str(ILdata)
## 'data.frame': 1808 obs. of 4 variables:
## $ age : Factor w/ 4 levels "15_19","20_24",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 0 0 0 0 0 0 0 0 0 0 ...
## $ county: Factor w/ 47 levels "ALEXANDER","BROWN",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ count : int 0 0 0 5 0 7 0 4 0 12 ...
# Age goes before year
# NOTE(review): this fit emits a convergence warning, and the random
# intercept/slope correlation is -1.00 (a degenerate fit) — the random-effects
# structure is likely over-parameterized for these data; results below should
# be read with that in mind.
modelOut <- lme4::glmer(count ~ age + year + (year|county), family = 'poisson', data = ILdata)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control
## $checkConv, : Model failed to converge with max|grad| = 0.00144074 (tol =
## 0.001, component 1)
summary(modelOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: count ~ age + year + (year | county)
## Data: ILdata
##
## AIC BIC logLik deviance df.resid
## 3215.6 3259.6 -1599.8 3199.6 1800
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4511 -0.0151 -0.0056 -0.0022 4.0053
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## county (Intercept) 129.9459 11.3994
## year 0.0648 0.2546 -1.00
## Number of obs: 1808, groups: county, 47
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.76258 2.13022 -5.052 4.36e-07 ***
## age20_24 -0.04152 0.03690 -1.125 0.261
## age25_29 -1.16262 0.05290 -21.976 < 2e-16 ***
## age30_34 -2.28278 0.08487 -26.898 < 2e-16 ***
## year 0.32708 0.25422 1.287 0.198
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) a20_24 a25_29 a30_34
## age20_24 -0.008
## age25_29 -0.006 0.341
## age30_34 -0.004 0.213 0.148
## year -0.764 0.000 0.000 0.000
## convergence code: 0
## Model failed to converge with max|grad| = 0.00144074 (tol = 0.001, component 1)
# Extract out fixed effects
lme4::fixef(modelOut)
## (Intercept) age20_24 age25_29 age30_34 year
## -10.76258497 -0.04151848 -1.16262225 -2.28277972 0.32708039
# Extract out random effects
# Returns a per-county data frame of intercept and year-slope deviations;
# note how the counties fall into two clusters of nearly identical values.
lme4::ranef(modelOut)
## $county
## (Intercept) year
## ALEXANDER -0.2847724 0.006331741
## BROWN -0.2847724 0.006331741
## CALHOUN -0.2847724 0.006331741
## CARROLL 12.2418514 -0.260423999
## CASS -0.2847724 0.006331741
## CLARK 12.2137668 -0.268553354
## CLAY -0.2847724 0.006331741
## CRAWFORD 12.5037407 -0.265752695
## CUMBERLAND -0.2847724 0.006331741
## DE WITT 12.7456078 -0.277675211
## DOUGLAS 13.0751590 -0.306329903
## EDGAR 12.3642794 -0.283606045
## EDWARDS -0.2847724 0.006331741
## FAYETTE 12.8094530 -0.273060474
## FORD -0.2847724 0.006331741
## GALLATIN -0.2847724 0.006331741
## GREENE -0.2847724 0.006331741
## HAMILTON -0.2847724 0.006331741
## HANCOCK 12.8581265 -0.305650287
## HARDIN -0.2847724 0.006331741
## HENDERSON -0.2847724 0.006331741
## IROQUOIS 13.1616741 -0.311372907
## JASPER -0.2847724 0.006331741
## JERSEY 12.9202747 -0.272284048
## JO DAVIESS 12.7409389 -0.289747791
## JOHNSON -0.2847724 0.006331741
## LAWRENCE 12.3713561 -0.268571236
## MARSHALL -0.2847724 0.006331741
## MASON -0.2847724 0.006331741
## MENARD -0.2180916 0.004849989
## MERCER 12.7534193 -0.271678572
## MOULTRIE -0.2180916 0.004849989
## PIATT 12.5653132 -0.296687752
## PIKE 12.5310614 -0.259211299
## POPE -0.2180916 0.004849989
## PULASKI -0.2180916 0.004849989
## PUTNAM -0.2180916 0.004849989
## RICHLAND 12.0350865 -0.273928951
## SCHUYLER -0.2180916 0.004849989
## SCOTT -0.2180916 0.004849989
## SHELBY 12.5183293 -0.283472292
## STARK -0.2180916 0.004849989
## UNION 13.1465272 -0.308673332
## WABASH -0.2180916 0.004849989
## WASHINGTON -0.2180916 0.004849989
## WAYNE 12.1148896 -0.253234752
## WHITE -0.2180916 0.004849989
# Run code to see one method for plotting the data
# One line per county, faceted by age group, with a per-panel Poisson trend.
ggplot(data = ILdata, aes(x = year, y = count, group = county)) +
geom_line() +
facet_grid(age ~ . ) +
stat_smooth( method = 'glm',
method.args = list( family = "poisson"), se = FALSE,
alpha = 0.5) +
theme_minimal()
Chapter 4 - Repeated Measures
An introduction to repeated measures:
Sleep study:
Hate in NY state?
Wrap up:
Example code includes:
# Paired before/after measurements on 10 subjects (x = subject id, treat
# alternates before/after), used to compare unpaired vs paired t-tests and a
# repeated-measures ANOVA via lmer.
y <- c(0.23, 2.735, -0.038, 6.327, -0.643, 1.69, -1.378, -1.228, -0.252, 2.014, -0.073, 6.101, 0.213, 3.127, -0.29, 8.395, -0.33, 2.735, 0.223, 1.301)
treat <- rep(c("before", "after"), times=10)
x <- rep(letters[1:10], each=2)
# Run a standard, non-paired t-test
t.test(y[treat == "before"], y[treat == "after"], paired = FALSE)
##
## Welch Two Sample t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -3.9043, df = 9.5409, p-value = 0.003215
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.594744 -1.512256
## sample estimates:
## mean of x mean of y
## -0.2338 3.3197
# Run a standard, paired t-test
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
# lmerTest masks lme4::lmer and stats::step so that lmer fits report p-values
# (see the attach messages below).
library(lmerTest)
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(lme4)
# Run the paired-test like before
t.test(y[treat == "before"], y[treat == "after"], paired = TRUE)
##
## Paired t-test
##
## data: y[treat == "before"] and y[treat == "after"]
## t = -4.2235, df = 9, p-value = 0.002228
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.456791 -1.650209
## sample estimates:
## mean of the differences
## -3.5535
# Run a repeated-measures ANOVA
# Random intercept per subject; note the p-value (0.002228) matches the
# paired t-test exactly.
anova(lmer( y ~ treat + (1|x)))
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## treat 63.137 63.137 1 9 17.838 0.002228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# lme4's built-in sleepstudy data: reaction time by days of sleep deprivation
# for 18 subjects.
data(sleepstudy, package="lme4")
str(sleepstudy)
## 'data.frame': 180 obs. of 3 variables:
## $ Reaction: num 250 259 251 321 357 ...
## $ Days : num 0 1 2 3 4 5 6 7 8 9 ...
## $ Subject : Factor w/ 18 levels "308","309","310",..: 1 1 1 1 1 1 1 1 1 1 ...
# Plot the data
# One line per subject, plus a single pooled lm trend line over all subjects.
ggplot(data = sleepstudy) +
geom_line(aes(x = Days, y = Reaction, group = Subject)) +
stat_smooth(aes(x = Days, y = Reaction), method = 'lm', size = 3, se = FALSE)
# Build a lm
lm( Reaction ~ Days, data = sleepstudy)
##
## Call:
## lm(formula = Reaction ~ Days, data = sleepstudy)
##
## Coefficients:
## (Intercept) Days
## 251.41 10.47
# Build a lmer
# Random intercept per subject; fixed-effect estimates match the simple lm.
(lmerOut <- lmer( Reaction ~ Days + (1|Subject), data = sleepstudy))
## Linear mixed model fit by REML ['lmerModLmerTest']
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
## REML criterion at convergence: 1786.465
## Random effects:
## Groups Name Std.Dev.
## Subject (Intercept) 37.12
## Residual 30.99
## Number of obs: 180, groups: Subject, 18
## Fixed Effects:
## (Intercept) Days
## 251.41 10.47
# The lmer model you built during the previous exercise has been saved as lmerOut
# During this exercise, you will examine the effects of drug type using both an ANOVA framework and a regression framework
# Run an anova
anova(lmerOut)
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## Days 162703 162703 1 161 169.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Look at the regression coefficients
summary(lmerOut)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Reaction ~ Days + (1 | Subject)
## Data: sleepstudy
##
## REML criterion at convergence: 1786.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2257 -0.5529 0.0109 0.5188 4.2506
##
## Random effects:
## Groups Name Variance Std.Dev.
## Subject (Intercept) 1378.2 37.12
## Residual 960.5 30.99
## Number of obs: 180, groups: Subject, 18
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 251.4051 9.7467 22.8100 25.79 <2e-16 ***
## Days 10.4673 0.8042 161.0000 13.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Days -0.371
# Read in NY hate data
# Yearly hate-crime incident counts per NY county; Year2 is Year rescaled to
# start at 0, which matters for model convergence below.
rawHate <- read.csv("./RInputFiles/hateNY.csv")
hate <- rawHate
str(hate)
## 'data.frame': 233 obs. of 4 variables:
## $ Year : int 2010 2011 2012 2013 2014 2015 2016 2013 2010 2011 ...
## $ County : Factor w/ 59 levels "Albany","Allegany",..: 1 1 1 1 1 1 1 2 3 3 ...
## $ TotalIncidents: int 13 7 5 3 3 3 3 1 22 11 ...
## $ Year2 : int 0 1 2 3 4 5 6 3 0 1 ...
ggplot( data = hate, aes(x = Year, y = TotalIncidents, group = County)) +
geom_line() +
geom_smooth(method = 'lm', se = FALSE)
# During this exercise, you will build a glmer
# Because most of the incidents are small count values, use a Poisson (R function family poisson) error term
# First, build a model using the actual year (variable Year, such as 2006, 2007, etc) - this model will fail
# Second, build a model using the rescaled year (variable Year2, such as 0, 1, 2, etc)
# This demonstrates the importance of considering where the intercept is located when building regression models
# Recall that a variable x can be both a fixed and random effect in a lmer() or glmer(): for example lmer(y ~ x + (x | group)) demonstrates this syntax
# glmer with raw Year
# Fails to converge and is nearly unidentifiable (see warnings) because the
# intercept sits far outside the data range at Year = 0.
glmer(TotalIncidents ~ Year + (Year|County), data = hate, family = "poisson")
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.370207
## (tol = 0.001, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year + (Year | County)
## Data: hate
## AIC BIC logLik deviance df.resid
## 1165.2746 1182.5298 -577.6373 1155.2746 228
## Random effects:
## Groups Name Std.Dev. Corr
## County (Intercept) 217.8915
## Year 0.1084 -1.00
## Number of obs: 233, groups: County, 59
## Fixed Effects:
## (Intercept) Year
## 295.4814 -0.1464
## convergence code 0; 3 optimizer warnings; 0 lme4 warnings
# glmer with scaled Year, Year2
# Same model on the rescaled year converges cleanly with no warnings.
glmerOut <- glmer(TotalIncidents ~ Year2 + (Year2|County), data = hate, family = "poisson")
summary(glmerOut)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: TotalIncidents ~ Year2 + (Year2 | County)
## Data: hate
##
## AIC BIC logLik deviance df.resid
## 1165.3 1182.5 -577.6 1155.3 228
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.5434 -0.4864 -0.1562 0.3319 3.1939
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## County (Intercept) 1.16291 1.0784
## Year2 0.01175 0.1084 0.02
## Number of obs: 233, groups: County, 59
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.27952 0.16600 7.708 1.28e-14 ***
## Year2 -0.14622 0.03324 -4.398 1.09e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Year2 -0.338
# Extract and manipulate data
# Pull the per-county random effects and order counties by their year trend
# so the dot plot below is sorted.
countyTrend <- ranef(glmerOut)$County
countyTrend$county <- factor(row.names(countyTrend), levels =row.names(countyTrend)[order(countyTrend$Year2)])
# Plot results
ggplot(data = countyTrend, aes(x = county, y = Year2)) + geom_point() +
coord_flip() +
ylab("Change in hate crimes per year") +
xlab("County")
Chapter 1 - Forecasting Demand with Time Series
Loading data into an xts object:
ARIMA Time Series 101:
Forecasting and Evaluating:
Example code includes:
# Read in beverages data
# Weekly beverage sales/prices by region (M, MET, SEC) and product tier
# (hi, lo, sp); ".p" columns are prices, the rest are unit sales.
rawBev <- read.csv("./RInputFiles/Bev.csv")
bev <- rawBev
str(bev)
## 'data.frame': 176 obs. of 14 variables:
## $ M.hi.p : num 59.2 56.3 56.3 49.3 61.3 ...
## $ M.lo.p : num 29.2 26.3 26.2 26.1 25.9 ...
## $ MET.hi.p: num 63.7 60.3 60.8 55.1 65.1 ...
## $ MET.lo.p: num 26 25.5 25.7 26.5 25.7 ...
## $ MET.sp.p: num 50.1 48.8 48.6 47.7 50.8 ...
## $ SEC.hi.p: num 58.6 54.6 57.9 49.7 63.7 ...
## $ SEC.lo.p: num 29.2 26.3 26.2 26.1 25.9 ...
## $ M.hi : int 458 477 539 687 389 399 392 417 568 583 ...
## $ M.lo : int 1455 1756 2296 3240 2252 1901 1939 1999 1798 1558 ...
## $ MET.hi : int 2037 1700 1747 2371 1741 2072 2353 2909 3204 2395 ...
## $ MET.lo : int 3437 3436 3304 3864 3406 3418 3553 3376 3233 3262 ...
## $ MET.sp : int 468 464 490 657 439 453 423 408 501 481 ...
## $ SEC.hi : int 156 151 178 217 141 149 134 148 195 170 ...
## $ SEC.lo : int 544 624 611 646 624 610 623 599 551 539 ...
# Load xts package
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(forecast)
# Create the dates object as an index for your xts object
# 176 weekly dates starting 2014-01-19, matching the 176 rows of bev.
dates <- seq(as.Date("2014-01-19"), length = 176, by = "weeks")
# Create an xts object called bev_xts
bev_xts <- xts(bev, order.by = dates)
# Create the individual region sales as their own objects
MET_hi <- bev_xts[,"MET.hi"]
MET_lo <- bev_xts[,"MET.lo"]
MET_sp <- bev_xts[,"MET.sp"]
# Sum the region sales together
MET_t <- MET_hi + MET_lo + MET_sp
# Plot the metropolitan region total sales
plot(MET_t)
# Split the data into training and validation
# Train on pre-2017 weeks; hold out 2017 for validation.
MET_t_train <- MET_t[index(MET_t) < "2017-01-01"]
MET_t_valid <- MET_t[index(MET_t) >= "2017-01-01"]
# Use auto.arima() function for metropolitan sales
MET_t_model <- auto.arima(MET_t_train)
# Forecast the first 22 weeks of 2017
forecast_MET_t <- forecast(MET_t_model, h = 22)
# Plot this forecast #
plot(forecast_MET_t)
# Convert to numeric for ease
for_MET_t <- as.numeric(forecast_MET_t$mean)
v_MET_t <- as.numeric(MET_t_valid)
# Calculate the MAE
MAE <- mean(abs(for_MET_t - v_MET_t))
# Calculate the MAPE
MAPE <- 100*mean(abs(for_MET_t - v_MET_t)/v_MET_t)
# Print to see how good your forecast is!
print(MAE)
## [1] 898.8403
print(MAPE)
## [1] 17.10332
# Convert your forecast to an xts object
for_dates <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_t_xts <- xts(forecast_MET_t$mean, order.by = for_dates)
# Plot the validation data set
plot(for_MET_t_xts, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the forecast of 2017
lines(MET_t_valid, col = "blue")
# Plot the validation data set
plot(MET_t_valid, main = 'Forecast Comparison', ylim = c(4000, 8500))
# Overlay the forecast of 2017
lines(for_MET_t_xts, col = "blue")
# Convert the limits to xts objects
# Column 2 of $lower/$upper is the wider (95%) interval from forecast().
lower <- xts(forecast_MET_t$lower[, 2], order.by = for_dates)
upper <- xts(forecast_MET_t$upper[, 2], order.by = for_dates)
# Adding confidence intervals of forecast to plot
lines(lower, col = "blue", lty = "dashed")
lines(upper, col = "blue", lty = "dashed")
Chapter 2 - Components of Demand
Price elasticity:
Seasonal/holiday/promotional effects:
Forecasting with regression:
Example code includes:
# Split the full beverage xts into pre-2017 training and 2017 validation sets.
bev_xts_train <- bev_xts[index(bev_xts) < "2017-01-01"]
bev_xts_valid <- bev_xts[index(bev_xts) >= "2017-01-01"]
# Save the prices of each product
l_MET_hi_p <- log(as.vector(bev_xts_train[, "MET.hi.p"]))
# Save as a data frame
# log-log regression, so the log_price coefficient is a price elasticity.
MET_hi_train <- data.frame(as.vector(log(MET_hi[index(MET_hi) < "2017-01-01"])), l_MET_hi_p)
colnames(MET_hi_train) <- c("log_sales", "log_price")
# Calculate the regression
model_MET_hi <- lm(log_sales ~ log_price, data = MET_hi_train)
# Plot the product's sales
plot(MET_hi)
# Plot the product's price
MET_hi_p <- bev_xts_train[, "MET.hi.p"]
plot(MET_hi_p)
# Create date indices for New Year's week
n.dates <- as.Date(c("2014-12-28", "2015-12-27", "2016-12-25"))
# Create xts objects for New Year's
newyear <- as.xts(rep(1, 3), order.by = n.dates)
# Create sequence of dates for merging
dates_train <- seq(as.Date("2014-01-19"), length = 154, by = "weeks")
# Merge training dates into New Year's object
# fill = 0 turns the 3-row indicator into a full-length 0/1 dummy series.
newyear <- merge(newyear, dates_train, fill = 0)
# Add newyear variable to your data frame
MET_hi_train <- data.frame(MET_hi_train, newyear=as.vector(newyear))
# Build regressions for the product
model_MET_hi_full <- lm(log_sales ~ log_price + newyear, data = MET_hi_train)
# Subset the validation prices #
l_MET_hi_p_valid <- log(as.vector(bev_xts_valid[, "MET.hi.p"]))
# Create a validation data frame #
MET_hi_valid <- data.frame(l_MET_hi_p_valid)
colnames(MET_hi_valid) <- "log_price"
# Predict the log of sales for your high end product
# Note: predictions use model_MET_hi (price only), not the newyear model.
pred_MET_hi <- predict(model_MET_hi, MET_hi_valid)
# Convert predictions out of log scale
pred_MET_hi <- exp(pred_MET_hi)
# Convert to an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
pred_MET_hi_xts <- xts(pred_MET_hi, order.by = dates_valid)
# Plot the forecast
plot(pred_MET_hi_xts)
# Calculate and print the MAPE
MET_hi_v <- bev_xts_valid[,"MET.hi"]
MAPE <- 100*mean(abs((pred_MET_hi_xts - MET_hi_v)/MET_hi_v))
print(MAPE)
## [1] 29.57455
Chapter 3 - Blending Regression with Time Series
Residuals from regression model:
Forecasting residuals:
Transfer functions and ensembling:
Example code includes:
# Calculate the residuals from the model
# Idea: the regression captures price/holiday effects; any remaining
# autocorrelation in its residuals can be modeled with ARIMA and the two
# forecasts multiplied together (both are on the log scale before exp()).
MET_hi_full_res <- resid(model_MET_hi_full)
# Convert the residuals to an xts object
MET_hi_full_res <- xts(MET_hi_full_res, order.by = dates_train)
# Plot the histogram of the residuals
hist(MET_hi_full_res)
# Plot the residuals over time
plot(MET_hi_full_res)
# Build an ARIMA model on the residuals: MET_hi_arima
MET_hi_arima <- auto.arima(MET_hi_full_res)
# Look at a summary of the model
summary(MET_hi_arima)
## Series: MET_hi_full_res
## ARIMA(2,0,2) with zero mean
##
## Coefficients:
## ar1 ar2 ma1 ma2
## 1.5736 -0.7833 -0.8149 0.2865
## s.e. 0.0992 0.0758 0.1266 0.0941
##
## sigma^2 estimated as 0.03921: log likelihood=32.19
## AIC=-54.37 AICc=-53.97 BIC=-39.19
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0004334041 0.1954382 0.145018 -47.18396 223.0022 0.5227844
## ACF1
## Training set -0.01034041
# Forecast 22 weeks with your model: for_MET_hi_arima
for_MET_hi_arima <- forecast(MET_hi_arima, h=22)
# Print first 10 observations
head(for_MET_hi_arima$mean, n = 10)
## Time Series:
## Start = 1079
## End = 1142
## Frequency = 0.142857142857143
## [1] -0.07662326 -0.10617141 -0.10705342 -0.08529747 -0.05037188
## [6] -0.01245420 0.01985656 0.04100076 0.04896519 0.04493645
# Convert your forecasts into an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_arima <- xts(for_MET_hi_arima$mean, order.by = dates_valid)
# Plot the forecast
plot(for_MET_hi_arima)
# Convert your residual forecast to the exponential version
for_MET_hi_arima <- exp(for_MET_hi_arima)
# Multiply your forecasts together!
# exp(log_regression + log_residual) = exp(regression) * exp(residual)
for_MET_hi_final <- for_MET_hi_arima * pred_MET_hi_xts
# Plot the final forecast - don't touch the options!
plot(for_MET_hi_final, ylim = c(1000, 4300))
# Overlay the validation data set
lines(MET_hi_v, col = "blue")
# Calculate the MAE
MAE <- mean(abs(for_MET_hi_final - MET_hi_v))
print(MAE)
## [1] 481.6678
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_hi_final - MET_hi_v)/MET_hi_v)
print(MAPE)
## [1] 28.82836
# Build an ARIMA model using the auto.arima function
# Baseline: ARIMA directly on sales (no regression), for comparison.
MET_hi_model_arima <- auto.arima(MET_hi)
# Forecast the ARIMA model
for_MET_hi <- forecast(MET_hi_model_arima, h = length(MET_hi_v))
# Save the forecast as an xts object
dates_valid <- seq(as.Date("2017-01-01"), length = 22, by = "weeks")
for_MET_hi_xts <- xts(for_MET_hi$mean, order.by = dates_valid)
# Calculate the MAPE of the forecast
MAPE <- 100 * mean(abs(for_MET_hi_xts - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 36.95411
# Ensemble the two forecasts together
# Simple average of the pure-ARIMA and regression forecasts.
for_MET_hi_en <- 0.5 * (for_MET_hi_xts + pred_MET_hi_xts)
# Calculate the MAE and MAPE
MAE <- mean(abs(for_MET_hi_en - MET_hi_v))
print(MAE)
## [1] 533.8911
MAPE <- 100 * mean(abs(for_MET_hi_en - MET_hi_v) / MET_hi_v)
print(MAPE)
## [1] 32.28549
Chapter 4 - Hierarchical Forecasting
Bottom-Up Hierarchical Forecasting:
Top-Down Hierarchical Forecasting:
Middle-Out Hierarchical Forecasting:
Wrap up:
Example code includes:
# Build a time series model
# Bottom-up forecasting for the specialty product: ARIMA, a holiday/price
# regression, and their ensemble, then summed with the other products.
MET_sp_model_arima <- auto.arima(MET_sp)
# Forecast the time series model for 22 periods
for_MET_sp <- forecast(MET_sp_model_arima, h=22)
# Create an xts object
for_MET_sp_xts <- xts(for_MET_sp$mean, order.by=dates_valid)
MET_sp_v <- MET_sp["2017"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_sp_xts - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.703272
# Build the training frame with log sales/price plus 0/1 holiday dummies
# flagged on the specific weeks each holiday falls in.
MET_sp_train <- bev_xts_train %>%
transform(log_sales = log(MET.sp), log_price=log(MET.sp.p))
MET_sp_train <- MET_sp_train[, c("log_sales", "log_price")]
MET_sp_train$newyear <- 0
MET_sp_train$valentine <- 0
MET_sp_train$christmas <- 0
MET_sp_train$mother <- 0
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-28", "2015-12-27", "2016-12-25")), "newyear"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-02-09", "2015-02-08", "2016-02-07")), "valentine"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-12-21", "2015-12-20", "2016-12-18")), "christmas"] <- 1
MET_sp_train[index(MET_sp_train) %in% as.Date(c("2014-05-04", "2015-05-03", "2016-05-01")), "mother"] <- 1
# THE BELOW IS TOTAL NONSENSE
# Build a regression model
model_MET_sp <- lm(log_sales ~ log_price + newyear + valentine + christmas + mother, data = MET_sp_train)
# Validation frame: same columns, with holiday dummies set by row position
# (row 7 = Valentine's week, row 19 = Mother's Day week in 2017).
MET_sp_valid <- as.data.frame(bev_xts_valid) %>%
mutate(log_sales = log(MET.sp), log_price=log(MET.sp.p)) %>%
select("log_sales", "log_price")
MET_sp_valid$newyear <- 0
MET_sp_valid$valentine <- 0
MET_sp_valid$christmas <- 0
MET_sp_valid$mother <- 0
MET_sp_valid[7, "valentine"] <- 1
MET_sp_valid[19, "mother"] <- 1
MET_sp_valid$log_sales <- NULL
# Forecast the regression model using the predict function
pred_MET_sp <- predict(model_MET_sp, MET_sp_valid)
# Exponentiate your predictions and create an xts object
pred_MET_sp <- exp(pred_MET_sp)
pred_MET_sp_xts <- xts(pred_MET_sp, order.by = dates_valid)
# Calculate MAPE
MAPE <- 100*mean(abs((pred_MET_sp_xts - MET_sp_v)/MET_sp_v))
print(MAPE)
## [1] 6.55473
# Ensemble the two forecasts
for_MET_sp_en <- 0.5 * (for_MET_sp_xts + pred_MET_sp_xts)
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_sp_en - MET_sp_v) / MET_sp_v)
print(MAPE)
## [1] 6.048594
# Copy over pred_MET_lo_xts
# Pre-computed low-end forecast from the course, hard-coded here.
pred_MET_lo_xts <- xts(c(2960.6, 2974.1, 2943.2, 2948.6, 2915.6, 2736.4, 2953.9, 3199.4, 2934, 2898.7, 3027.7, 3165.9, 3073.1, 2842.7, 2928.7, 3070.2, 2982.2, 3018, 3031.9, 2879.4, 2993.2, 2974.1), order.by=dates_valid)
# Calculate the metropolitan regional sales forecast
# Bottom-up: sum the three product-level forecasts.
for_MET_total <- pred_MET_hi_xts + for_MET_sp_en + pred_MET_lo_xts
# Calculate a validation data set
MET_t_v <- bev_xts_valid[,"MET.hi"] + bev_xts_valid[,"MET.lo"] + bev_xts_valid[,"MET.sp"]
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_MET_total - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 10.61952
# Create the MET_total data
# NOTE(review): the data.frame column is named MET.hi even though it holds
# the regional total series -- the name looks like a copy-paste artifact;
# confirm against the original course materials.
MET_total <- xts(data.frame(MET.hi=c(5942, 5600, 5541, 6892, 5586, 5943, 6329, 6693, 6938, 6138, 6361, 6378, 5423, 5097, 4937, 5496, 6870, 6626, 6356, 5657, 6577, 7202, 7381, 7404, 7204, 6667, 6153, 6035, 5633, 5283, 5178, 4758, 5058, 5254, 5954, 6166, 6247, 6304, 7202, 6662, 6814, 6174, 5412, 5380, 5674, 6472, 6912, 7404, 8614, 8849, 7174, 6489, 7174, 6555, 6402, 7671, 5012, 4790, 5075, 5238, 5615, 6113, 7706, 7811, 7898, 7232, 6585, 5870, 7084, 5125, 5330, 5553, 6349, 6195, 6271, 5851, 5333, 5854, 5609, 5649, 6051, 6409, 5786, 5190, 5085, 4949, 5151, 5147, 5426, 5509, 6956, 7870, 8224, 6685, 6153, 5802, 5244, 5162, 5036, 5025, 8378, 8944, 7109, 7605, 7846, 7598, 8012, 9551, 6102, 5366, 4932, 4962, 5392, 6194, 7239, 7621, 7460, 7097, 6596, 5848, 8306, 5344, 5848, 6341, 7364, 7269, 7053, 6682, 6971, 7521, 7063, 6298, 6003, 5227, 5047, 4877, 4851, 4628, 4516, 4442, 4935, 5181, 5431, 5866, 5919, 5704, 5957, 6019, 5962, 6021, 5880, 5674, 7439, 7415)),
order.by=dates_train
)
# Build a regional time series model
# (auto.arima() chooses an ARIMA specification automatically)
MET_t_model_arima <- auto.arima(MET_total)
# Calculate a 2017 forecast for 22 periods
for_MET_t <- forecast(MET_t_model_arima, h=22)
# Make an xts object from your forecast (point forecasts are in $mean)
for_MET_t_xts <- xts(for_MET_t$mean, order.by=dates_valid)
# Calculate the MAPE -- the top-level model (17.1) is worse here than the
# bottom-up aggregate computed above (10.6)
MAPE <- 100 * mean(abs(for_MET_t_xts - MET_t_v) / MET_t_v)
print(MAPE)
## [1] 17.10332
# Calculate the average historical proportions
# (average-of-proportions disaggregation: the mean of each period's product
# share of the regional total)
prop_hi <- mean(MET_hi/MET_total)
prop_lo <- mean(MET_lo/MET_total)
prop_sp <- mean(MET_sp/MET_total)
# Distribute out your forecast to each product by scaling the regional
# forecast with each product's average share
for_prop_hi <- prop_hi*for_MET_t_xts
for_prop_lo <- prop_lo*for_MET_t_xts
for_prop_sp <- prop_sp*for_MET_t_xts
# Calculate the MAPE's for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.7318
# Validation series for the low-end product (not extracted earlier)
MET_lo_v <- bev_xts_valid[,"MET.lo"]
MAPE_lo <- 100 * mean(abs(for_prop_lo - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 10.70649
MAPE_sp <- 100 * mean(abs(for_prop_sp - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.232888
# Calculate the average historical proportions
# (proportion-of-averages variant: ratio of the product's mean to the
# regional total's mean, rather than the mean of per-period ratios above)
prop_hi_2 <- mean(MET_hi) / mean(MET_total)
prop_lo_2 <- mean(MET_lo) / mean(MET_total)
prop_sp_2 <- mean(MET_sp) / mean(MET_total)
# Distribute out your forecast to each product
for_prop_hi_2 <- prop_hi_2 * for_MET_t_xts
for_prop_lo_2 <- prop_lo_2 * for_MET_t_xts
for_prop_sp_2 <- prop_sp_2 * for_MET_t_xts
# Calculate the MAPE's for each product
# (vs. the first method: better for lo, slightly worse for sp)
MAPE_hi <- 100 * mean(abs(for_prop_hi_2 - MET_hi_v) / MET_hi_v)
print(MAPE_hi)
## [1] 38.33559
MAPE_lo <- 100 * mean(abs(for_prop_lo_2 - MET_lo_v) / MET_lo_v)
print(MAPE_lo)
## [1] 8.450784
MAPE_sp <- 100 * mean(abs(for_prop_sp_2 - MET_sp_v) / MET_sp_v)
print(MAPE_sp)
## [1] 6.517045
# Create the SEC regional total data
# NOTE(review): as with MET_total, the column is named SEC.hi even though it
# holds the regional total series -- likely a copy-paste artifact; confirm.
SEC_total <- xts(data.frame(SEC.hi=c(700, 775, 789, 863, 765, 759, 757, 747, 746, 709, 749, 786, 796, 726, 727, 723, 778, 755, 739, 740, 723, 695, 727, 707, 725, 684, 667, 698, 727, 722, 748, 695, 742, 739, 715, 724, 686, 671, 688, 682, 710, 700, 672, 680, 695, 780, 751, 693, 809, 881, 703, 712, 768, 796, 808, 904, 641, 662, 693, 725, 719, 736, 715, 722, 732, 745, 689, 705, 811, 739, 744, 700, 745, 735, 732, 722, 721, 732, 750, 714, 752, 677, 731, 674, 720, 675, 741, 722, 715, 719, 649, 697, 743, 733, 772, 698, 690, 734, 713, 644, 788, 833, 749, 731, 670, 675, 675, 993, 773, 751, 697, 677, 750, 723, 780, 763, 721, 701, 704, 684, 985, 791, 731, 714, 704, 694, 685, 652, 708, 754, 747, 705, 711, 699, 712, 745, 706, 665, 666, 692, 676, 696, 689, 697, 689, 717, 697, 708, 660, 707, 715, 680, 922, 888)), order.by=dates_train
)
# Build a time series model for the region
SEC_t_model_arima <- auto.arima(SEC_total)
# Forecast the time series model (22 validation periods)
for_SEC_t <- forecast(SEC_t_model_arima, h=22)
# Make into an xts object
for_SEC_t_xts <- xts(for_SEC_t$mean, order.by=dates_valid)
# Validation total for this region (only two products: hi and lo)
SEC_t_v <- bev_xts_valid$SEC.hi + bev_xts_valid$SEC.lo
# Calculate the MAPE
MAPE <- 100 * mean(abs(for_SEC_t_xts - SEC_t_v) / SEC_t_v)
print(MAPE)
## [1] 4.742324
# Pull the SEC product series for training and validation
SEC_hi <- bev_xts_train[, "SEC.hi"]
SEC_lo <- bev_xts_train[, "SEC.lo"]
SEC_hi_v <- bev_xts_valid[, "SEC.hi"]
SEC_lo_v <- bev_xts_valid[, "SEC.lo"]
# Calculate the average of historical proportions
# (mean of each period's product share of the regional total)
prop_hi <- mean(SEC_hi / SEC_total)
prop_lo <- mean(SEC_lo / SEC_total)
# Distribute the forecast by scaling the regional forecast with each share
for_prop_hi <- prop_hi * for_SEC_t_xts
for_prop_lo <- prop_lo * for_SEC_t_xts
# Calculate a MAPE for each product
MAPE_hi <- 100 * mean(abs(for_prop_hi - SEC_hi_v) / SEC_hi_v)
print(MAPE_hi)
## [1] 7.988508
MAPE_lo <- 100 * mean(abs(for_prop_lo - SEC_lo_v) / SEC_lo_v)
print(MAPE_lo)
## [1] 5.202529
# Copy over for_M_t_xts data
# (hard-coded 22-period 2017 forecast for the remaining region, aligned to
# the validation dates)
for_M_t_xts <- xts(c(2207, 2021, 2010, 2052, 2075, 2074, 2065, 2058, 2056, 2055, 2053, 2052, 2050, 2049, 2048, 2047, 2046, 2045, 2044, 2043, 2043, 2042), order.by=dates_valid)
# Calculate the state sales forecast: for_state
# (sum of the three regional forecasts; fixed `=` to the conventional `<-`
# assignment operator -- behavior is identical)
for_state <- for_SEC_t_xts + for_MET_t_xts + for_M_t_xts
# See the forecasts
for_state
## [,1]
## 2017-01-01 9996.689
## 2017-01-08 9525.915
## 2017-01-15 9342.760
## 2017-01-22 9269.321
## 2017-01-29 9214.912
## 2017-02-05 9162.005
## 2017-02-12 9118.199
## 2017-02-19 9087.859
## 2017-02-26 9070.209
## 2017-03-05 9058.715
## 2017-03-12 9049.677
## 2017-03-19 9043.959
## 2017-03-26 9038.794
## 2017-04-02 9035.673
## 2017-04-09 9033.250
## 2017-04-16 9031.296
## 2017-04-23 9029.656
## 2017-04-30 9028.227
## 2017-05-07 9026.939
## 2017-05-14 9025.746
## 2017-05-21 9025.617
## 2017-05-28 9024.530
Chapter 1 - Identifying the Best Recruiting Source
Introduction - Ben Teusch, HR Analytics Consultant:
Recruiting and quality of hire:
Visualizing recruiting data:
Example code includes:
# Import the recruitment data
recruitment <- readr::read_csv("./RInputFiles/recruitment_data.csv")
## Parsed with column specification:
## cols(
## attrition = col_integer(),
## performance_rating = col_integer(),
## sales_quota_pct = col_double(),
## recruiting_source = col_character()
## )
# Look at the first few rows of the dataset
head(recruitment)
## # A tibble: 6 x 4
## attrition performance_rating sales_quota_pct recruiting_source
## <int> <int> <dbl> <chr>
## 1 1 3 1.09 Applied Online
## 2 0 3 2.39 <NA>
## 3 1 2 0.498 Campus
## 4 0 2 2.51 <NA>
## 5 0 3 1.42 Applied Online
## 6 1 3 0.548 Referral
# Get an overview of the recruitment data
summary(recruitment)
## attrition performance_rating sales_quota_pct recruiting_source
## Min. :0.000 Min. :1.000 Min. :-0.7108 Length:446
## 1st Qu.:0.000 1st Qu.:2.000 1st Qu.: 0.5844 Class :character
## Median :0.000 Median :3.000 Median : 1.0701 Mode :character
## Mean :0.213 Mean :2.895 Mean : 1.0826
## 3rd Qu.:0.000 3rd Qu.:3.000 3rd Qu.: 1.5325
## Max. :1.000 Max. :5.000 Max. : 3.6667
# See which recruiting sources the company has been using
# (note 205 of 446 rows have a missing recruiting_source)
recruitment %>%
count(recruiting_source)
## # A tibble: 5 x 2
## recruiting_source n
## <chr> <int>
## 1 Applied Online 130
## 2 Campus 56
## 3 Referral 45
## 4 Search Firm 10
## 5 <NA> 205
# Find the average sales quota attainment for each recruiting source
avg_sales <- recruitment %>%
group_by(recruiting_source) %>%
summarize(avg_sales_quota_pct=mean(sales_quota_pct))
# Display the result
avg_sales
## # A tibble: 5 x 2
## recruiting_source avg_sales_quota_pct
## <chr> <dbl>
## 1 Applied Online 1.06
## 2 Campus 0.908
## 3 Referral 1.02
## 4 Search Firm 0.887
## 5 <NA> 1.17
# Find the average attrition for the sales team, by recruiting source, sorted from lowest attrition rate to highest
# (attrition is 0/1, so its mean is the attrition rate)
avg_attrition <- recruitment %>%
group_by(recruiting_source) %>%
summarize(attrition_rate=mean(attrition)) %>%
arrange(attrition_rate)
# Display the result
avg_attrition
## # A tibble: 5 x 2
## recruiting_source attrition_rate
## <chr> <dbl>
## 1 <NA> 0.132
## 2 Applied Online 0.246
## 3 Campus 0.286
## 4 Referral 0.333
## 5 Search Firm 0.500
# Plot the bar chart of quota attainment by source (NA shows as its own bar)
avg_sales %>% ggplot(aes(x=recruiting_source, y=avg_sales_quota_pct)) + geom_col()
# Plot the bar chart of attrition rate by source
avg_attrition %>% ggplot(aes(x=recruiting_source, y=attrition_rate)) + geom_col()
Chapter 2 - What is driving low employee engagement?
Analyzing employee engagement:
Visualizing the engagement data:
Are differences meaningful?
Example code includes:
# Import the data
survey <- readr::read_csv("./RInputFiles/survey_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## engagement = col_integer(),
## salary = col_double(),
## vacation_days_taken = col_integer()
## )
# Get an overview of the data
summary(survey)
## employee_id department engagement salary
## Min. : 1.0 Length:1470 Min. :1.00 Min. : 45530
## 1st Qu.: 491.2 Class :character 1st Qu.:3.00 1st Qu.: 59407
## Median :1020.5 Mode :character Median :3.00 Median : 70481
## Mean :1024.9 Mean :3.05 Mean : 74162
## 3rd Qu.: 1555.8 3rd Qu.:4.00 3rd Qu.: 84763
## Max. :2068.0 Max. :5.00 Max. :164073
## vacation_days_taken
## Min. : 0.00
## 1st Qu.: 6.00
## Median :10.00
## Mean :11.27
## 3rd Qu.:16.00
## Max. :38.00
# Examine the counts of the department variable
survey %>% count(department)
## # A tibble: 3 x 2
## department n
## <chr> <int>
## 1 Engineering 961
## 2 Finance 63
## 3 Sales 446
# Output the average engagement score for each department, sorted
# (engagement is on a 1-5 scale; Sales is lowest)
survey %>%
group_by(department) %>%
summarize(avg_engagement=mean(engagement)) %>%
arrange(avg_engagement)
## # A tibble: 3 x 2
## department avg_engagement
## <chr> <dbl>
## 1 Sales 2.81
## 2 Engineering 3.15
## 3 Finance 3.24
# Create the disengaged variable and assign the result to survey
# (engagement <= 2 counts as disengaged; ifelse() returns numeric 1/0 here,
# which prints as <dbl> below)
survey_disengaged <- survey %>%
mutate(disengaged = ifelse(engagement <= 2, 1, 0))
survey_disengaged
## # A tibble: 1,470 x 6
## employee_id department engagement salary vacation_days_tak~ disengaged
## <int> <chr> <int> <dbl> <int> <dbl>
## 1 1 Sales 3 103264 7 0
## 2 2 Engineering 3 80709 12 0
## 3 4 Engineering 3 60737 12 0
## 4 5 Engineering 3 99116 7 0
## 5 7 Engineering 3 51022 18 0
## 6 8 Engineering 3 98400 9 0
## 7 10 Engineering 3 57106 18 0
## 8 11 Engineering 1 55065 4 1.00
## 9 12 Engineering 4 77158 12 0
## 10 13 Engineering 2 48365 14 1.00
## # ... with 1,460 more rows
# Summarize the three variables by department
survey_summary <- survey_disengaged %>%
group_by(department) %>%
summarize(pct_disengaged=mean(disengaged),
avg_salary=mean(salary),
avg_vacation_taken=mean(vacation_days_taken)
)
survey_summary
## # A tibble: 3 x 4
## department pct_disengaged avg_salary avg_vacation_taken
## <chr> <dbl> <dbl> <dbl>
## 1 Engineering 0.206 73576 12.2
## 2 Finance 0.190 76652 11.5
## 3 Sales 0.330 75074 9.22
# Gather data for plotting
# (reshape wide summary to long measure/value pairs; gather() matches the
# tidyr 0.8.0 loaded in this document -- pivot_longer() did not exist yet)
survey_gathered <- survey_summary %>%
gather(key = "measure", value = "value",
pct_disengaged, avg_salary, avg_vacation_taken)
# Create three bar charts, one facet per measure, free scales since the
# measures have very different units
ggplot(survey_gathered, aes(x=measure, y=value, fill=department)) +
geom_col(position="dodge") +
facet_wrap(~ measure, scales="free")
# Add the in_sales variable
# (here the disengaged cutoff is written as engagement < 3 with integer
# literals 1L/0L -- equivalent to the <= 2 cutoff above for integer scores)
survey_sales <- survey %>%
mutate(in_sales = ifelse(department == "Sales", "Sales", "Other"),
disengaged = ifelse(engagement < 3, 1L, 0L)
)
# Test the hypothesis using survey_sales
# (chi-squared test of independence between disengagement and being in Sales)
chisq.test(survey_sales$disengaged, survey_sales$in_sales)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: survey_sales$disengaged and survey_sales$in_sales
## X-squared = 25.524, df = 1, p-value = 4.368e-07
# Same question via a two-sample t-test on the 0/1 disengaged indicator
t.test(disengaged ~ in_sales, data=survey_sales)
##
## Welch Two Sample t-test
##
## data: disengaged by in_sales
## t = -4.862, df = 743.16, p-value = 1.419e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.17479596 -0.07424062
## sample estimates:
## mean in group Other mean in group Sales
## 0.2050781 0.3295964
# Test the hypothesis using the survey_sales data
# (Sales takes significantly fewer vacation days than other departments)
t.test(vacation_days_taken ~ in_sales, data = survey_sales)
##
## Welch Two Sample t-test
##
## data: vacation_days_taken by in_sales
## t = 8.1549, df = 1022.9, p-value = 1.016e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.229473 3.642409
## sample estimates:
## mean in group Other mean in group Sales
## 12.160156 9.224215
Chapter 3 - Are new hires getting paid too much?
Paying new hires fairly:
Omitted variable bias:
Linear regression helps to test the multivariate impacts of variables:
Example code includes:
# Import the data
pay <- readr::read_csv("./RInputFiles/fair_pay_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## salary = col_double(),
## new_hire = col_character(),
## job_level = col_character()
## )
# Get an overview of the data
summary(pay)
## employee_id department salary new_hire
## Min. : 1.0 Length:1470 Min. : 43820 Length:1470
## 1st Qu.: 491.2 Class :character 1st Qu.: 59378 Class :character
## Median :1020.5 Mode :character Median : 70425 Mode :character
## Mean :1024.9 Mean : 74142
## 3rd Qu.:1555.8 3rd Qu.: 84809
## Max. :2068.0 Max. :164073
## job_level
## Length:1470
## Class :character
## Mode :character
##
##
##
# Check average salary of new hires and non-new hires
pay %>%
group_by(new_hire) %>%
summarize(avg_salary=mean(salary))
## # A tibble: 2 x 2
## new_hire avg_salary
## <chr> <dbl>
## 1 No 73425
## 2 Yes 76074
# Perform the correct statistical test
# (new hires appear to earn significantly more -- but see the job_level
# control below; this is the omitted-variable setup)
t.test(salary ~ new_hire, data = pay)
##
## Welch Two Sample t-test
##
## data: salary by new_hire
## t = -2.3437, df = 685.16, p-value = 0.01938
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4869.4242 -429.9199
## sample estimates:
## mean in group No mean in group Yes
## 73424.60 76074.28
# Same test, tidied into a one-row data frame with broom
t.test(salary ~ new_hire, data = pay) %>%
broom::tidy()
## estimate estimate1 estimate2 statistic p.value parameter conf.low
## 1 -2649.672 73424.6 76074.28 -2.343708 0.01937799 685.1554 -4869.424
## conf.high method alternative
## 1 -429.9199 Welch Two Sample t-test two.sided
# Create a stacked bar chart
# (position="fill" shows the job_level mix within each new_hire group)
pay %>%
ggplot(aes(x=new_hire, fill=job_level)) +
geom_bar(position="fill")
# Calculate the average salary for each group of interest
pay_grouped <- pay %>%
group_by(new_hire, job_level) %>%
summarize(avg_salary = mean(salary))
# Graph the results using facet_wrap()
pay_grouped %>%
ggplot(aes(x=new_hire, y=avg_salary)) +
geom_col() +
facet_wrap(~ job_level)
# Filter the data to include only hourly employees
pay_filter <- pay %>%
filter(job_level == "Hourly")
# Test the difference in pay
# (within the Hourly level the difference is no longer significant, p = 0.08)
t.test(salary ~ new_hire, data=pay_filter) %>%
broom::tidy()
## estimate estimate1 estimate2 statistic p.value parameter conf.low
## 1 -1106.967 63965.71 65072.68 -1.750387 0.08066517 499.7005 -2349.483
## conf.high method alternative
## 1 135.5483 Welch Two Sample t-test two.sided
# Run the simple regression (salary on new_hire only)
model_simple <- lm(salary ~ new_hire, data = pay)
# Display the summary of model_simple
model_simple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32255 -14466 -3681 10740 87998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73424.6 577.2 127.200 <2e-16 ***
## new_hireYes 2649.7 1109.4 2.388 0.017 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18900 on 1468 degrees of freedom
## Multiple R-squared: 0.003871, Adjusted R-squared: 0.003193
## F-statistic: 5.705 on 1 and 1468 DF, p-value: 0.01704
# Display a tidy summary
model_simple %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 73424.603 577.2369 127.200112 0.00000000
## 2 new_hireYes 2649.672 1109.3568 2.388476 0.01704414
# Run the multiple regression, adding job_level as a control
# (the new_hire effect drops from 2650 to 783 and loses significance once
# job level is held constant -- the raw gap was driven by job-level mix)
model_multiple <- lm(salary ~ new_hire + job_level, data = pay)
# Display the summary of model_multiple
model_multiple %>%
summary()
##
## Call:
## lm(formula = salary ~ new_hire + job_level, data = pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21013 -7091 -425 6771 44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64049.3 308.3 207.722 <2e-16 ***
## new_hireYes 782.7 524.8 1.491 0.136
## job_levelManager 54918.8 915.3 60.001 <2e-16 ***
## job_levelSalaried 26865.6 567.2 47.369 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8930 on 1466 degrees of freedom
## Multiple R-squared: 0.7779, Adjusted R-squared: 0.7775
## F-statistic: 1712 on 3 and 1466 DF, p-value: < 2.2e-16
# Display a tidy summary
model_multiple %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) 64049.3417 308.3415 207.722067 0.000000e+00
## 2 new_hireYes 782.7358 524.8133 1.491456 1.360570e-01
## 3 job_levelManager 54918.8477 915.2966 60.001152 0.000000e+00
## 4 job_levelSalaried 26865.6336 567.1569 47.368962 7.387196e-298
Chapter 4 - Are performance ratings being given consistently?
Joining HR data:
Performance ratings and fairness:
Logistic regression is especially helpful for modeling binary response variables:
Example code includes:
# Import the data
hr_data <- readr::read_csv("./RInputFiles/hr_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## department = col_character(),
## job_level = col_character(),
## gender = col_character()
## )
performance_data <- readr::read_csv("./RInputFiles/performance_data.csv")
## Parsed with column specification:
## cols(
## employee_id = col_integer(),
## rating = col_integer()
## )
# Examine the datasets
summary(hr_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
summary(performance_data)
## employee_id rating
## Min. : 1.0 Min. :1.00
## 1st Qu.: 491.2 1st Qu.:2.00
## Median :1020.5 Median :3.00
## Mean :1024.9 Mean :2.83
## 3rd Qu.:1555.8 3rd Qu.:4.00
## Max. :2068.0 Max. :5.00
# Join the two tables on the shared employee_id key
joined_data <- left_join(hr_data, performance_data, by = "employee_id")
# Examine the result
summary(joined_data)
## employee_id department job_level gender
## Min. : 1.0 Length:1470 Length:1470 Length:1470
## 1st Qu.: 491.2 Class :character Class :character Class :character
## Median :1020.5 Mode :character Mode :character Mode :character
## Mean :1024.9
## 3rd Qu.:1555.8
## Max. :2068.0
## rating
## Min. :1.00
## 1st Qu.:2.00
## Median :3.00
## Mean :2.83
## 3rd Qu.:4.00
## Max. :5.00
# Check whether the average performance rating differs by gender
joined_data %>%
group_by(gender) %>%
summarize(avg_rating = mean(rating))
## # A tibble: 2 x 2
## gender avg_rating
## <chr> <dbl>
## 1 Female 2.75
## 2 Male 2.92
# Add the high_performer column (rating of 4 or 5 on the 1-5 scale)
performance <- joined_data %>%
mutate(high_performer = ifelse(rating >= 4, 1, 0))
# Test whether one gender is more likely to be a high performer
chisq.test(performance$gender, performance$high_performer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: performance$gender and performance$high_performer
## X-squared = 22.229, df = 1, p-value = 2.42e-06
# Do the same test, and tidy the output
chisq.test(performance$gender, performance$high_performer) %>% broom::tidy()
## statistic p.value parameter
## 1 22.22915 2.419716e-06 1
## method
## 1 Pearson's Chi-squared test with Yates' continuity correction
# Visualize the distribution of high_performer by gender
# (factor() so ggplot treats the 0/1 indicator as discrete fill levels)
performance %>%
ggplot(aes(x=gender, fill=factor(high_performer))) +
geom_bar(position="fill")
# Visualize the distribution of all ratings by gender
performance %>%
ggplot(aes(x=gender, fill=factor(rating))) +
geom_bar(position="fill")
# Visualize the distribution of job_level by gender
performance %>%
ggplot(aes(x = gender, fill = job_level)) +
geom_bar(position = "fill")
# Test whether men and women have different job level distributions
# (they do -- job_level is a candidate confounder for the rating gap)
chisq.test(performance$gender, performance$job_level)
##
## Pearson's Chi-squared test
##
## data: performance$gender and performance$job_level
## X-squared = 44.506, df = 2, p-value = 2.166e-10
# Visualize the distribution of high_performer by gender, faceted by job level
performance %>%
ggplot(aes(x = gender, fill = factor(high_performer))) +
geom_bar(position = "fill") +
facet_wrap(~ job_level)
# Run a simple logistic regression
# (binomial family models the 0/1 high_performer outcome)
logistic_simple <- glm(high_performer ~ gender, family = "binomial", data = performance)
# View the result with summary()
logistic_simple %>%
summary()
##
## Call:
## glm(formula = high_performer ~ gender, family = "binomial", data = performance)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8871 -0.8871 -0.6957 1.4986 1.7535
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.29540 0.08813 -14.699 < 2e-16 ***
## genderMale 0.56596 0.11921 4.748 2.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1709.0 on 1469 degrees of freedom
## Residual deviance: 1686.1 on 1468 degrees of freedom
## AIC: 1690.1
##
## Number of Fisher Scoring iterations: 4
# View a tidy version of the result
logistic_simple %>%
broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -1.295395 0.0881306 -14.698586 6.581976e-49
## 2 genderMale 0.565958 0.1192110 4.747532 2.059140e-06
# Run a multiple logistic regression controlling for job level
# (the gender log-odds shrink from 0.57 to 0.32 once job_level is included,
# though the effect remains significant at p = 0.013)
logistic_multiple <- glm(high_performer ~ gender + job_level, family = "binomial", data = performance)
# View the result with summary() or tidy()
logistic_multiple %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -1.6945360 0.1025902 -16.517522 2.744425e-61
## 2 genderMale 0.3191103 0.1290197 2.473345 1.338548e-02
## 3 job_levelManager 2.7436430 0.2514410 10.911677 1.013683e-27
## 4 job_levelSalaried 1.0992751 0.1405230 7.822742 5.168508e-15
Chapter 5 - Improving employee safety with data
Employee safety - looking at accident rates and drivers:
Focusing on the location of interest:
Explaining the increase in accidents:
Wrap up:
Example code includes:
# Import the data
hr_data <- readr::read_csv("./RInputFiles/hr_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## location = col_character(),
## overtime_hours = col_integer()
## )
accident_data <- readr::read_csv("./RInputFiles/accident_data.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## accident_type = col_character()
## )
# Create hr_joined with left_join() and mutate()
# (the left join keeps all employee-years; accident_type is NA when no
# accident record matched, which is how had_accident is derived)
hr_joined <- left_join(hr_data, accident_data, by=c("year", "employee_id")) %>%
mutate(had_accident=ifelse(is.na(accident_type), 0, 1))
hr_joined
## # A tibble: 2,940 x 6
## year employee_id location overtime_hours accident_type had_accident
## <int> <int> <chr> <int> <chr> <dbl>
## 1 2016 1 Northwood 14 <NA> 0
## 2 2017 1 Northwood 8 Mild 1.00
## 3 2016 2 East Valley 8 <NA> 0
## 4 2017 2 East Valley 11 <NA> 0
## 5 2016 4 East Valley 4 <NA> 0
## 6 2017 4 East Valley 2 Mild 1.00
## 7 2016 5 West River 0 <NA> 0
## 8 2017 5 West River 17 <NA> 0
## 9 2016 7 West River 21 <NA> 0
## 10 2017 7 West River 9 <NA> 0
## # ... with 2,930 more rows
# Find accident rate for each year (mean of the 0/1 indicator)
hr_joined %>%
group_by(year) %>%
summarize(accident_rate = mean(had_accident))
## # A tibble: 2 x 2
## year accident_rate
## <int> <dbl>
## 1 2016 0.0850
## 2 2017 0.120
# Test difference in accident rate between years
chisq.test(hr_joined$year, hr_joined$had_accident)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hr_joined$year and hr_joined$had_accident
## X-squared = 9.5986, df = 1, p-value = 0.001947
# Which location had the highest accident rate?
hr_joined %>%
group_by(location) %>%
summarize(accident_rate=mean(had_accident)) %>%
arrange(-accident_rate)
## # A tibble: 4 x 2
## location accident_rate
## <chr> <dbl>
## 1 East Valley 0.128
## 2 Southfield 0.103
## 3 West River 0.0961
## 4 Northwood 0.0831
# Compare annual accident rates by location
accident_rates <- hr_joined %>%
group_by(location, year) %>%
summarize(accident_rate = mean(had_accident))
accident_rates
## # A tibble: 8 x 3
## # Groups: location [?]
## location year accident_rate
## <chr> <int> <dbl>
## 1 East Valley 2016 0.113
## 2 East Valley 2017 0.143
## 3 Northwood 2016 0.0644
## 4 Northwood 2017 0.102
## 5 Southfield 2016 0.0764
## 6 Southfield 2017 0.130
## 7 West River 2016 0.0824
## 8 West River 2017 0.110
# Graph it (one facet per location, year on the x axis)
accident_rates %>%
ggplot(aes(factor(year), accident_rate)) +
geom_col() +
facet_wrap(~location)
# Filter out the other locations, keeping only Southfield
southfield <- hr_joined %>%
filter(location == "Southfield")
# Find the average overtime hours worked by year
southfield %>%
group_by(year) %>%
summarize(average_overtime_hours = mean(overtime_hours))
## # A tibble: 2 x 2
## year average_overtime_hours
## <int> <dbl>
## 1 2016 8.67
## 2 2017 9.97
# Test difference in Southfield's overtime hours between years
# (not significant at p = 0.109 -- overtime does not explain the increase)
t.test(overtime_hours ~ year, data=southfield)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -1.6043, df = 595.46, p-value = 0.1092
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.904043 0.292747
## sample estimates:
## mean in group 2016 mean in group 2017
## 8.667774 9.973422
# Import the survey data
survey_data <- readr::read_csv("./RInputFiles/survey_data_2.csv")
## Parsed with column specification:
## cols(
## year = col_integer(),
## employee_id = col_integer(),
## engagement = col_integer()
## )
# Create the safety dataset
# (join on employee-year; disengaged = engagement of 2 or less; year becomes
# a factor so later models treat it as categorical)
safety <- left_join(hr_joined, survey_data, by=c("employee_id", "year")) %>%
mutate(disengaged=ifelse(engagement <= 2, 1, 0), year=factor(year))
# Visualize the difference in % disengaged by year in Southfield
safety %>%
filter(location=="Southfield") %>%
ggplot(aes(x = year, fill = factor(disengaged))) +
geom_bar(position = "fill")
# Test whether one year had significantly more disengaged employees
southSafety <- safety %>%
filter(location=="Southfield")
chisq.test(southSafety$disengaged, southSafety$year)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: southSafety$disengaged and southSafety$year
## X-squared = 7.1906, df = 1, p-value = 0.007329
# Filter out Southfield to check whether the patterns hold elsewhere
other_locs <- safety %>%
filter(location != "Southfield")
# Test whether one year had significantly more overtime hours worked
t.test(overtime_hours ~ year, data = other_locs)
##
## Welch Two Sample t-test
##
## data: overtime_hours by year
## t = -0.48267, df = 2320.3, p-value = 0.6294
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.9961022 0.6026035
## sample estimates:
## mean in group 2016 mean in group 2017
## 9.278015 9.474765
# Test whether one year had significantly more disengaged employees
# (no difference outside Southfield -- the disengagement rise is local)
chisq.test(other_locs$year, other_locs$disengaged)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: other_locs$year and other_locs$disengaged
## X-squared = 0.0023091, df = 1, p-value = 0.9617
# Use multiple regression to test the impact of year and disengaged on accident rate in Southfield
regression <- glm(had_accident ~ year + disengaged, family = "binomial", data = southSafety)
# Examine the output
# (disengagement, not year, predicts accidents once both are in the model)
regression %>% broom::tidy()
## term estimate std.error statistic p.value
## 1 (Intercept) -2.9237983 0.2504685 -11.673318 1.744826e-31
## 2 year2017 0.4402062 0.2848346 1.545480 1.222302e-01
## 3 disengaged 1.4425408 0.2780927 5.187265 2.134047e-07
Chapter 1 - Cars Data
Making predictions using machine learning:
Getting started with caret:
Sampling data:
Example code includes:
cars2018 <- readr::read_csv("./RInputFiles/cars2018.csv")
## Parsed with column specification:
## cols(
## Model = col_character(),
## `Model Index` = col_integer(),
## Displacement = col_double(),
## Cylinders = col_integer(),
## Gears = col_integer(),
## Transmission = col_character(),
## MPG = col_integer(),
## Aspiration = col_character(),
## `Lockup Torque Converter` = col_character(),
## Drive = col_character(),
## `Max Ethanol` = col_integer(),
## `Recommended Fuel` = col_character(),
## `Intake Valves Per Cyl` = col_integer(),
## `Exhaust Valves Per Cyl` = col_integer(),
## `Fuel injection` = col_character()
## )
str(cars2018, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1144 obs. of 15 variables:
## $ Model : chr "Acura NSX" "ALFA ROMEO 4C" "Audi R8 AWD" "Audi R8 RWD" ...
## $ Model Index : int 57 410 65 71 66 72 46 488 38 278 ...
## $ Displacement : num 3.5 1.8 5.2 5.2 5.2 5.2 2 3 8 6.2 ...
## $ Cylinders : int 6 4 10 10 10 10 4 6 16 8 ...
## $ Gears : int 9 6 7 7 7 7 6 7 7 8 ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ MPG : int 21 28 17 18 17 18 26 20 11 18 ...
## $ Aspiration : chr "Turbocharged/Supercharged" "Turbocharged/Supercharged" "Naturally Aspirated" "Naturally Aspirated" ...
## $ Lockup Torque Converter: chr "Y" "Y" "Y" "Y" ...
## $ Drive : chr "All Wheel Drive" "2-Wheel Drive, Rear" "All Wheel Drive" "2-Wheel Drive, Rear" ...
## $ Max Ethanol : int 10 10 15 15 15 15 15 10 15 10 ...
## $ Recommended Fuel : chr "Premium Unleaded Required" "Premium Unleaded Required" "Premium Unleaded Recommended" "Premium Unleaded Recommended" ...
## $ Intake Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Exhaust Valves Per Cyl : int 2 2 2 2 2 2 2 2 2 1 ...
## $ Fuel injection : chr "Direct ignition" "Direct ignition" "Direct ignition" "Direct ignition" ...
summary(cars2018)
## Model Model Index Displacement Cylinders
## Length:1144 Min. : 1.0 Min. :1.000 Min. : 3.000
## Class :character 1st Qu.: 36.0 1st Qu.:2.000 1st Qu.: 4.000
## Mode :character Median :108.0 Median :3.000 Median : 6.000
## Mean :201.3 Mean :3.087 Mean : 5.564
## 3rd Qu.:323.8 3rd Qu.:3.600 3rd Qu.: 6.000
## Max. :821.0 Max. :8.000 Max. :16.000
## Gears Transmission MPG Aspiration
## Min. : 1.000 Length:1144 Min. :11.0 Length:1144
## 1st Qu.: 6.000 Class :character 1st Qu.:19.0 Class :character
## Median : 7.000 Mode :character Median :23.0 Mode :character
## Mean : 6.935 Mean :23.2
## 3rd Qu.: 8.000 3rd Qu.:26.0
## Max. :10.000 Max. :58.0
## Lockup Torque Converter Drive Max Ethanol
## Length:1144 Length:1144 Min. :10.00
## Class :character Class :character 1st Qu.:10.00
## Mode :character Mode :character Median :10.00
## Mean :15.29
## 3rd Qu.:15.00
## Max. :85.00
## Recommended Fuel Intake Valves Per Cyl Exhaust Valves Per Cyl
## Length:1144 Min. :1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Median :2.000 Median :2.000
## Mean :1.926 Mean :1.922
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :2.000 Max. :2.000
## Fuel injection
## Length:1144
## Class :character
## Mode :character
##
##
##
# Print the cars2018 object
cars2018
## # A tibble: 1,144 x 15
## Model `Model Index` Displacement Cylinders Gears Transmission MPG
## <chr> <int> <dbl> <int> <int> <chr> <int>
## 1 Acura NSX 57 3.50 6 9 Manual 21
## 2 ALFA ROM~ 410 1.80 4 6 Manual 28
## 3 Audi R8 ~ 65 5.20 10 7 Manual 17
## 4 Audi R8 ~ 71 5.20 10 7 Manual 18
## 5 Audi R8 ~ 66 5.20 10 7 Manual 17
## 6 Audi R8 ~ 72 5.20 10 7 Manual 18
## 7 Audi TT ~ 46 2.00 4 6 Manual 26
## 8 BMW M4 D~ 488 3.00 6 7 Manual 20
## 9 Bugatti ~ 38 8.00 16 7 Manual 11
## 10 Chevrole~ 278 6.20 8 8 Automatic 18
## # ... with 1,134 more rows, and 8 more variables: Aspiration <chr>,
## # `Lockup Torque Converter` <chr>, Drive <chr>, `Max Ethanol` <int>,
## # `Recommended Fuel` <chr>, `Intake Valves Per Cyl` <int>, `Exhaust
## # Valves Per Cyl` <int>, `Fuel injection` <chr>
# Plot the histogram
ggplot(cars2018, aes(x = MPG)) +
geom_histogram(bins = 25) +
labs(y = "Number of cars",
x = "Fuel efficiency (mpg)")
# Deselect the 2 columns to create cars_vars
cars_vars <- cars2018 %>%
select(-Model, -`Model Index`)
# Fit a linear model
fit_all <- lm(MPG ~ ., data = cars_vars)
# Print the summary of the model
summary(fit_all)
##
## Call:
## lm(formula = MPG ~ ., data = cars_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5261 -1.6473 -0.1096 1.3572 26.5045
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 44.539519 1.176283
## Displacement -3.786147 0.264845
## Cylinders 0.520284 0.161802
## Gears 0.157674 0.069984
## TransmissionCVT 4.877637 0.404051
## TransmissionManual -1.074608 0.366075
## AspirationTurbocharged/Supercharged -2.190248 0.267559
## `Lockup Torque Converter`Y -2.624494 0.381252
## Drive2-Wheel Drive, Rear -2.676716 0.291044
## Drive4-Wheel Drive -3.397532 0.335147
## DriveAll Wheel Drive -2.941084 0.257174
## `Max Ethanol` -0.007377 0.005898
## `Recommended Fuel`Premium Unleaded Required -0.403935 0.262413
## `Recommended Fuel`Regular Unleaded Recommended -0.996343 0.272495
## `Intake Valves Per Cyl` -1.446107 1.620575
## `Exhaust Valves Per Cyl` -2.469747 1.547748
## `Fuel injection`Multipoint/sequential ignition -0.658428 0.243819
## t value Pr(>|t|)
## (Intercept) 37.865 < 2e-16 ***
## Displacement -14.296 < 2e-16 ***
## Cylinders 3.216 0.001339 **
## Gears 2.253 0.024450 *
## TransmissionCVT 12.072 < 2e-16 ***
## TransmissionManual -2.935 0.003398 **
## AspirationTurbocharged/Supercharged -8.186 7.24e-16 ***
## `Lockup Torque Converter`Y -6.884 9.65e-12 ***
## Drive2-Wheel Drive, Rear -9.197 < 2e-16 ***
## Drive4-Wheel Drive -10.137 < 2e-16 ***
## DriveAll Wheel Drive -11.436 < 2e-16 ***
## `Max Ethanol` -1.251 0.211265
## `Recommended Fuel`Premium Unleaded Required -1.539 0.124010
## `Recommended Fuel`Regular Unleaded Recommended -3.656 0.000268 ***
## `Intake Valves Per Cyl` -0.892 0.372400
## `Exhaust Valves Per Cyl` -1.596 0.110835
## `Fuel injection`Multipoint/sequential ignition -2.700 0.007028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.916 on 1127 degrees of freedom
## Multiple R-squared: 0.7314, Adjusted R-squared: 0.7276
## F-statistic: 191.8 on 16 and 1127 DF, p-value: < 2.2e-16
# Load caret (masks purrr::lift; pulls in lattice)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# 80/20 train/test split. NOTE(review): createDataPartition stratifies on
# its y argument, so this split is stratified by Transmission rather than by
# the outcome MPG — confirm that is intentional.
set.seed(1234)
in_train <- createDataPartition(cars_vars$Transmission, p = 0.8, list = FALSE)
training <- cars_vars[in_train, ]
testing <- cars_vars[-in_train, ]
# Single linear-regression fit on log(MPG); method = "none" means caret does
# no resampling here — just one fit on the training set.
fit_lm <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_lm
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Single random-forest fit on log(MPG), again with no resampling
# (method = "none"), for a first comparison against the linear model.
fit_rf <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "none"))
# Print the model object
fit_rf
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: None
# Add each model's predictions as new columns on the TRAINING data.
# NOTE(review): both models were fit to log(MPG), so predict() returns
# log-scale values here.
results <- training %>%
mutate(`Linear regression` = predict(fit_lm, training),
`Random forest` = predict(fit_rf, training))
# NOTE(review): truth = MPG is on the raw scale while the estimates are on
# the log scale, so the rmse of ~20.9 below mixes scales and is not
# meaningful — this should be truth = log(MPG) (or exponentiate predictions).
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.702
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.9 0.845
# Same evaluation, now on the held-out TESTING data.
results <- testing %>%
mutate(`Linear regression` = predict(fit_lm, testing),
`Random forest` = predict(fit_rf, testing))
# NOTE(review): same scale mismatch as above — predictions are log(MPG)
# while truth = MPG is raw, so the rmse values below are not interpretable;
# compare both on the same (log) scale.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.880
# Refit both models with bootstrap resampling (caret's default is 25
# bootstrap reps, as the output below confirms) so performance estimates come
# from resampling rather than a single fit.
cars_lm_bt <- train(log(MPG) ~ ., method = "lm", data = training,
trControl = trainControl(method = "boot"))
# NOTE(review): the rank-deficiency warnings suggest some dummy-coded factor
# levels are absent/collinear within individual bootstrap resamples — worth
# confirming which predictor causes it.
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
cars_rf_bt <- train(log(MPG) ~ ., method = "rf", data = training,
trControl = trainControl(method = "boot"))
# Quick look at the models: resampled RMSE/R^2/MAE on the log(MPG) scale
cars_lm_bt
## Linear Regression
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1036278 0.7890514 0.07656104
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cars_rf_bt
## Random Forest
##
## 916 samples
## 12 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 916, 916, 916, 916, 916, 916, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 0.10015480 0.8205322 0.07299305
## 9 0.08758544 0.8466598 0.06129895
## 16 0.09100659 0.8360034 0.06313542
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 9.
# Evaluate the bootstrap-resampled models on the held-out test set.
results <- testing %>%
mutate(`Linear regression` = predict(cars_lm_bt, testing),
`Random forest` = predict(cars_rf_bt, testing))
# NOTE(review): same scale mismatch as earlier — predictions are on the
# log(MPG) scale while truth = MPG is raw, so these rmse values mix scales.
yardstick::metrics(results, truth = MPG, estimate = `Linear regression`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.799
yardstick::metrics(results, truth = MPG, estimate = `Random forest`)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 20.5 0.903
# Predicted vs. observed log(MPG), one panel per model; the dashed gray
# y = x line marks perfect prediction. NOTE(review): gather() is superseded
# in current tidyr — pivot_longer() is the modern equivalent.
results %>%
gather(Method, Result, `Linear regression`:`Random forest`) %>%
ggplot(aes(log(MPG), Result, color = Method)) +
geom_point(size = 1.5, alpha = 0.5) +
facet_wrap(~Method) +
geom_abline(lty = 2, color = "gray50") +
geom_smooth(method = "lm")
Chapter 2 - Stack Overflow Developer Data
Essential copying and pasting from Stack Overflow (largest and most trusted developer community):
Dealing with imbalanced data:
Predicting remote status:
Logistic regression. Example code includes:
# Load the Stack Overflow survey extract; most columns parse as logicals
# (the role indicator columns), plus a few typed columns listed below.
stackoverflow <- readr::read_csv("./RInputFiles/stackoverflow.csv")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## Respondent = col_integer(),
## Country = col_character(),
## Salary = col_double(),
## YearsCodedJob = col_integer(),
## CompanySizeNumber = col_double(),
## Remote = col_character(),
## CareerSatisfaction = col_integer()
## )
## See spec(...) for full column specifications.
# Make the outcome a factor with "Not remote" as the first level — caret's
# confusionMatrix treats the first level as the positive class (see the
# 'Positive' Class line in the confusion-matrix output later).
stackoverflow$Remote <- factor(stackoverflow$Remote, levels=c("Not remote", "Remote"))
# Compact structure listing (attributes suppressed)
str(stackoverflow, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6991 obs. of 22 variables:
## $ Respondent : int 3 15 18 19 26 55 62 71 73 77 ...
## $ Country : chr "United Kingdom" "United Kingdom" "United States" "United States" ...
## $ Salary : num 113750 100000 130000 82500 175000 ...
## $ YearsCodedJob : int 20 20 20 3 16 4 1 1 20 20 ...
## $ OpenSource : logi TRUE FALSE TRUE FALSE FALSE FALSE ...
## $ Hobby : logi TRUE TRUE TRUE TRUE TRUE FALSE ...
## $ CompanySizeNumber : num 10000 5000 1000 10000 10000 1000 5000 20 100 1000 ...
## $ Remote : Factor w/ 2 levels "Not remote","Remote": 1 2 2 1 1 1 1 1 2 2 ...
## $ CareerSatisfaction : int 8 8 9 5 7 9 5 8 8 10 ...
## $ Data scientist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Database administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Desktop applications developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Developer with stats/math background: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ DevOps : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Embedded developer : logi FALSE TRUE TRUE FALSE FALSE FALSE ...
## $ Graphic designer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Graphics programming : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Machine learning specialist : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Mobile developer : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Quality assurance engineer : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ Systems administrator : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Web developer : logi FALSE FALSE TRUE TRUE TRUE TRUE ...
# Print the tibble: 6,991 respondents x 22 columns (see output below)
stackoverflow
## # A tibble: 6,991 x 22
## Respondent Country Salary YearsCodedJob OpenSource Hobby
## <int> <chr> <dbl> <int> <lgl> <lgl>
## 1 3 United Kingdom 113750 20 T T
## 2 15 United Kingdom 100000 20 F T
## 3 18 United States 130000 20 T T
## 4 19 United States 82500 3 F T
## 5 26 United States 175000 16 F T
## 6 55 Germany 64516 4 F F
## 7 62 India 6636 1 F T
## 8 71 United States 65000 1 F T
## 9 73 United States 120000 20 T T
## 10 77 United States 96283 20 T T
## # ... with 6,981 more rows, and 16 more variables:
## # CompanySizeNumber <dbl>, Remote <fct>, CareerSatisfaction <int>, `Data
## # scientist` <lgl>, `Database administrator` <lgl>, `Desktop
## # applications developer` <lgl>, `Developer with stats/math
## # background` <lgl>, DevOps <lgl>, `Embedded developer` <lgl>, `Graphic
## # designer` <lgl>, `Graphics programming` <lgl>, `Machine learning
## # specialist` <lgl>, `Mobile developer` <lgl>, `Quality assurance
## # engineer` <lgl>, `Systems administrator` <lgl>, `Web developer` <lgl>
# Count the outcome classes — the output shows heavy imbalance
# (6273 "Not remote" vs 718 "Remote"), which motivates up-sampling later.
stackoverflow %>%
count(Remote, sort = TRUE)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 6273
## 2 Remote 718
# Count respondents per country, largest first
stackoverflow %>%
count(Country, sort = TRUE)
## # A tibble: 5 x 2
## Country n
## <chr> <int>
## 1 United States 3486
## 2 United Kingdom 1270
## 3 Germany 950
## 4 India 666
## 5 Canada 619
# Boxplot: years of professional coding experience by remote status
ggplot(stackoverflow, aes(x=Remote, y=YearsCodedJob)) +
geom_boxplot() +
labs(x = NULL,
y = "Years of professional coding experience")
# Naive logistic regression on the full (imbalanced) data, dropping the
# Respondent ID. The magrittr dot (`data = .`) feeds the selected tibble
# into glm()'s data argument.
simple_glm <- stackoverflow %>%
select(-Respondent) %>%
glm(Remote ~ .,
family = "binomial",
data = .)
# Print the coefficient summary (shown in the output below)
summary(simple_glm)
##
## Call:
## glm(formula = Remote ~ ., family = "binomial", data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1942 -0.4971 -0.3824 -0.2867 2.9118
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -4.156e+00 2.929e-01 -14.187
## CountryGermany -2.034e-01 2.196e-01 -0.927
## CountryIndia 9.574e-01 2.220e-01 4.312
## CountryUnited Kingdom 5.599e-02 1.974e-01 0.284
## CountryUnited States 5.990e-01 1.799e-01 3.330
## Salary 4.076e-06 1.589e-06 2.565
## YearsCodedJob 7.133e-02 7.556e-03 9.440
## OpenSourceTRUE 4.207e-01 8.555e-02 4.917
## HobbyTRUE 8.330e-03 9.827e-02 0.085
## CompanySizeNumber -6.104e-05 1.223e-05 -4.990
## CareerSatisfaction 6.748e-02 2.664e-02 2.533
## `Data scientist`TRUE -1.186e-01 1.838e-01 -0.645
## `Database administrator`TRUE 2.763e-01 1.267e-01 2.181
## `Desktop applications developer`TRUE -2.903e-01 9.842e-02 -2.950
## `Developer with stats/math background`TRUE 2.840e-02 1.359e-01 0.209
## DevOpsTRUE -1.532e-01 1.292e-01 -1.185
## `Embedded developer`TRUE -2.777e-01 1.653e-01 -1.680
## `Graphic designer`TRUE -1.904e-01 2.725e-01 -0.699
## `Graphics programming`TRUE 1.078e-01 2.312e-01 0.466
## `Machine learning specialist`TRUE -2.289e-01 2.769e-01 -0.827
## `Mobile developer`TRUE 2.170e-01 1.019e-01 2.130
## `Quality assurance engineer`TRUE -2.826e-01 2.437e-01 -1.160
## `Systems administrator`TRUE 1.462e-01 1.421e-01 1.029
## `Web developer`TRUE 1.158e-01 9.993e-02 1.159
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## CountryGermany 0.354161
## CountryIndia 1.62e-05 ***
## CountryUnited Kingdom 0.776710
## CountryUnited States 0.000868 ***
## Salary 0.010314 *
## YearsCodedJob < 2e-16 ***
## OpenSourceTRUE 8.78e-07 ***
## HobbyTRUE 0.932444
## CompanySizeNumber 6.04e-07 ***
## CareerSatisfaction 0.011323 *
## `Data scientist`TRUE 0.518709
## `Database administrator`TRUE 0.029184 *
## `Desktop applications developer`TRUE 0.003178 **
## `Developer with stats/math background`TRUE 0.834400
## DevOpsTRUE 0.235833
## `Embedded developer`TRUE 0.093039 .
## `Graphic designer`TRUE 0.484596
## `Graphics programming`TRUE 0.641060
## `Machine learning specialist`TRUE 0.408484
## `Mobile developer`TRUE 0.033194 *
## `Quality assurance engineer`TRUE 0.246098
## `Systems administrator`TRUE 0.303507
## `Web developer`TRUE 0.246655
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4627.8 on 6990 degrees of freedom
## Residual deviance: 4268.8 on 6967 degrees of freedom
## AIC: 4316.8
##
## Number of Fisher Scoring iterations: 5
# Drop the ID column before modeling
stack_select <- stackoverflow %>%
select(-Respondent)
# 80/20 split, stratified on the outcome Remote
set.seed(1234)
in_train <- caret::createDataPartition(stack_select$Remote, p=0.8, list = FALSE)
training <- stack_select[in_train,]
testing <- stack_select[-in_train,]
# Up-sample the minority class in the training set so both classes have
# equal counts (5019 each, per the output below)
up_train <- caret::upSample(x = select(training, -Remote), y = training$Remote, yname = "Remote") %>%
as_tibble()
up_train %>%
count(Remote)
## # A tibble: 2 x 2
## Remote n
## <fct> <int>
## 1 Not remote 5019
## 2 Remote 5019
# Random 5% subsample of the training rows (kept in original row order) —
# used below to keep the random-forest fit fast. NOTE(review): prefer
# seq_len(nrow(training)) over 1:nrow(training) as a general safeguard.
inUse <- sample(1:nrow(training), round(0.05*nrow(training)), replace=FALSE)
useTrain <- training[sort(inUse), ]
# Logistic regression with bootstrap resampling; sampling = "up" applies
# up-sampling inside each resample. Note this model uses the FULL training
# set, not the 5% subsample.
stack_glm <- caret::train(Remote ~ ., method="glm", family="binomial", data = training,
trControl = trainControl(method = "boot", sampling = "up")
)
# Print the model object
stack_glm
## Generalized Linear Model
##
## 5594 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5594, 5594, 5594, 5594, 5594, 5594, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.6568743 0.1279825
# Random forest with bootstrap resampling and in-resample up-sampling.
# NOTE(review): unlike the glm above, this is trained on the 5% subsample
# (280 rows per the output), so the two models are not directly comparable
# on equal footing.
stack_rf <- caret::train(Remote ~ ., method="rf", data = useTrain,
trControl = trainControl(method = "boot", sampling="up")
)
# Print the model object
stack_rf
## Random Forest
##
## 280 samples
## 20 predictor
## 2 classes: 'Not remote', 'Remote'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 280, 280, 280, 280, 280, 280, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8626254 0.110738058
## 12 0.9038825 -0.002127159
## 23 0.8887612 0.035777206
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
# Confusion matrix for the logistic regression model on the test set
# ("Not remote" is the positive class, per the factor level order)
caret::confusionMatrix(predict(stack_glm, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 837 53
## Remote 417 90
##
## Accuracy : 0.6636
## 95% CI : (0.6381, 0.6883)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1395
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6675
## Specificity : 0.6294
## Pos Pred Value : 0.9404
## Neg Pred Value : 0.1775
## Prevalence : 0.8976
## Detection Rate : 0.5991
## Detection Prevalence : 0.6371
## Balanced Accuracy : 0.6484
##
## 'Positive' Class : Not remote
##
# Confusion matrix for the random forest model on the test set — higher raw
# accuracy but very low specificity (see output), typical of imbalanced data
caret::confusionMatrix(predict(stack_rf, testing), testing$Remote)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Not remote Remote
## Not remote 1207 125
## Remote 47 18
##
## Accuracy : 0.8769
## 95% CI : (0.8585, 0.8937)
## No Information Rate : 0.8976
## P-Value [Acc > NIR] : 0.9945
##
## Kappa : 0.1166
## Mcnemar's Test P-Value : 4.327e-09
##
## Sensitivity : 0.9625
## Specificity : 0.1259
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.2769
## Prevalence : 0.8976
## Detection Rate : 0.8640
## Detection Prevalence : 0.9535
## Balanced Accuracy : 0.5442
##
## 'Positive' Class : Not remote
##
# Add each model's class predictions to the test set
testing_results <- testing %>%
mutate(`Logistic regression` = predict(stack_glm, testing), `Random forest` = predict(stack_rf, testing))
## Calculate accuracy for each model on the test set
yardstick::accuracy(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.6635648
yardstick::accuracy(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.876879
## Calculate positive predictive value (precision for the positive class)
yardstick::ppv(testing_results, truth = Remote, estimate = `Logistic regression`)
## [1] 0.9404494
yardstick::ppv(testing_results, truth = Remote, estimate = `Random forest`)
## [1] 0.9061562
Chapter 3 - Voting
Predicting voter turnout from survey data:
Vote 2016:
Cross-validation is the process of sub-dividing the data into folds, with each fold used once as the validation set:
Comparing model performance:
Example code includes:
# Load the voter survey; all columns parse as integers except the outcome
voters <- readr::read_csv("./RInputFiles/voters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## turnout16_2016 = col_character()
## )
## See spec(...) for full column specifications.
# Outcome factor with "Did not vote" first — the first level becomes the
# positive class in caret's confusionMatrix output later
voters$turnout16_2016 <- factor(voters$turnout16_2016, levels=c("Did not vote", "Voted"))
# Compact structure listing (attributes suppressed)
str(voters, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6692 obs. of 43 variables:
## $ case_identifier : int 779 2108 2597 4148 4460 5225 5903 6059 8048 13112 ...
## $ turnout16_2016 : Factor w/ 2 levels "Did not vote",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ RIGGED_SYSTEM_1_2016: int 3 2 2 1 3 3 3 2 4 2 ...
## $ RIGGED_SYSTEM_2_2016: int 4 1 4 4 1 3 4 3 4 3 ...
## $ RIGGED_SYSTEM_3_2016: int 1 3 1 1 3 2 1 3 1 1 ...
## $ RIGGED_SYSTEM_4_2016: int 4 1 4 4 1 2 1 2 3 2 ...
## $ RIGGED_SYSTEM_5_2016: int 3 3 1 2 3 2 2 1 3 2 ...
## $ RIGGED_SYSTEM_6_2016: int 2 2 1 1 2 3 1 2 1 2 ...
## $ track_2016 : int 2 2 1 1 2 2 1 2 2 2 ...
## $ persfinretro_2016 : int 2 3 3 1 2 2 2 3 2 1 ...
## $ econtrend_2016 : int 1 3 3 1 2 2 1 3 1 1 ...
## $ Americatrend_2016 : int 1 1 1 3 3 1 2 3 2 1 ...
## $ futuretrend_2016 : int 4 1 1 3 4 3 1 3 1 1 ...
## $ wealth_2016 : int 2 1 2 2 1 2 2 1 2 2 ...
## $ values_culture_2016 : int 2 3 3 3 3 2 3 3 1 3 ...
## $ US_respect_2016 : int 2 3 1 1 2 2 2 3 3 3 ...
## $ trustgovt_2016 : int 3 3 3 3 3 2 3 3 3 3 ...
## $ trust_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ helpful_people_2016 : int 1 1 2 1 1 1 2 2 1 2 ...
## $ fair_people_2016 : int 8 2 1 1 1 2 2 1 2 1 ...
## $ imiss_a_2016 : int 2 1 1 1 1 2 1 1 3 1 ...
## $ imiss_b_2016 : int 2 1 1 2 1 1 1 2 1 1 ...
## $ imiss_c_2016 : int 1 2 2 3 1 2 2 1 4 2 ...
## $ imiss_d_2016 : int 1 2 1 1 1 1 1 2 1 1 ...
## $ imiss_e_2016 : int 1 1 3 1 1 3 1 2 1 1 ...
## $ imiss_f_2016 : int 2 1 1 2 1 2 1 3 2 1 ...
## $ imiss_g_2016 : int 1 4 3 3 3 1 3 4 2 2 ...
## $ imiss_h_2016 : int 1 2 2 2 1 1 1 2 1 1 ...
## $ imiss_i_2016 : int 2 2 4 4 2 1 1 3 2 1 ...
## $ imiss_j_2016 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ imiss_k_2016 : int 1 2 1 1 2 1 1 4 2 1 ...
## $ imiss_l_2016 : int 1 4 1 2 4 1 1 3 1 1 ...
## $ imiss_m_2016 : int 1 2 1 2 1 1 1 1 1 1 ...
## $ imiss_n_2016 : int 1 2 1 1 1 1 1 2 2 1 ...
## $ imiss_o_2016 : int 2 1 1 1 1 2 1 2 2 1 ...
## $ imiss_p_2016 : int 2 1 2 3 1 3 1 1 4 1 ...
## $ imiss_q_2016 : int 1 1 1 2 2 1 1 4 2 1 ...
## $ imiss_r_2016 : int 2 1 1 2 1 2 1 2 4 2 ...
## $ imiss_s_2016 : int 1 2 1 2 2 1 1 1 1 1 ...
## $ imiss_t_2016 : int 1 1 3 3 1 1 3 4 1 1 ...
## $ imiss_u_2016 : int 2 2 2 2 1 3 3 1 4 2 ...
## $ imiss_x_2016 : int 1 3 1 2 1 1 1 4 1 1 ...
## $ imiss_y_2016 : int 1 4 2 3 1 1 1 3 2 1 ...
# Print the tibble: 6,692 respondents x 43 columns (see output below)
voters
## # A tibble: 6,692 x 43
## case_identifier turnout16_2016 RIGGED_SYSTEM_1_2016 RIGGED_SYSTEM_2_20~
## <int> <fct> <int> <int>
## 1 779 Voted 3 4
## 2 2108 Voted 2 1
## 3 2597 Voted 2 4
## 4 4148 Voted 1 4
## 5 4460 Voted 3 1
## 6 5225 Voted 3 3
## 7 5903 Voted 3 4
## 8 6059 Voted 2 3
## 9 8048 Voted 4 4
## 10 13112 Voted 2 3
## # ... with 6,682 more rows, and 39 more variables:
## # RIGGED_SYSTEM_3_2016 <int>, RIGGED_SYSTEM_4_2016 <int>,
## # RIGGED_SYSTEM_5_2016 <int>, RIGGED_SYSTEM_6_2016 <int>,
## # track_2016 <int>, persfinretro_2016 <int>, econtrend_2016 <int>,
## # Americatrend_2016 <int>, futuretrend_2016 <int>, wealth_2016 <int>,
## # values_culture_2016 <int>, US_respect_2016 <int>,
## # trustgovt_2016 <int>, trust_people_2016 <int>,
## # helpful_people_2016 <int>, fair_people_2016 <int>, imiss_a_2016 <int>,
## # imiss_b_2016 <int>, imiss_c_2016 <int>, imiss_d_2016 <int>,
## # imiss_e_2016 <int>, imiss_f_2016 <int>, imiss_g_2016 <int>,
## # imiss_h_2016 <int>, imiss_i_2016 <int>, imiss_j_2016 <int>,
## # imiss_k_2016 <int>, imiss_l_2016 <int>, imiss_m_2016 <int>,
## # imiss_n_2016 <int>, imiss_o_2016 <int>, imiss_p_2016 <int>,
## # imiss_q_2016 <int>, imiss_r_2016 <int>, imiss_s_2016 <int>,
## # imiss_t_2016 <int>, imiss_u_2016 <int>, imiss_x_2016 <int>,
## # imiss_y_2016 <int>
# How many people voted? Output shows severe imbalance: 264 non-voters
# vs 6428 voters.
voters %>%
count(turnout16_2016)
## # A tibble: 2 x 2
## turnout16_2016 n
## <fct> <int>
## 1 Did not vote 264
## 2 Voted 6428
# How do the responses on the survey vary with voting behavior?
# Each summary is the proportion of respondents matching the given answer.
voters %>%
group_by(turnout16_2016) %>%
summarize(`Elections don't matter` = mean(RIGGED_SYSTEM_1_2016 <= 2),
`Economy is getting better` = mean(econtrend_2016 == 1),
`Crime is very important` = mean(imiss_a_2016 == 2))
## # A tibble: 2 x 4
## turnout16_2016 `Elections don't ~ `Economy is gettin~ `Crime is very im~
## <fct> <dbl> <dbl> <dbl>
## 1 Did not vote 0.553 0.163 0.292
## 2 Voted 0.341 0.289 0.342
## Overlaid density-scaled histograms of the economy question by turnout.
## NOTE(review): the `..density..` notation is superseded in current ggplot2
## by after_stat(density) — works here but worth updating.
voters %>%
ggplot(aes(econtrend_2016, ..density.., fill = turnout16_2016)) +
geom_histogram(alpha = 0.5, position = "identity", binwidth = 1) +
labs(title = "Overall, is the economy getting better or worse?")
# Remove the case_identifier column (an ID, not a predictor)
voters_select <- voters %>%
select(-case_identifier)
# Naive logistic regression on the full (imbalanced) data, using every
# remaining column as a predictor of turnout
simple_glm <- glm(turnout16_2016 ~ ., family = "binomial",
data = voters_select)
# Print the coefficient summary (shown in the output below)
summary(simple_glm)
##
## Call:
## glm(formula = turnout16_2016 ~ ., family = "binomial", data = voters_select)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2373 0.1651 0.2214 0.3004 1.7708
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.457036 0.732721 3.353 0.000799 ***
## RIGGED_SYSTEM_1_2016 0.236284 0.085081 2.777 0.005484 **
## RIGGED_SYSTEM_2_2016 0.064749 0.089208 0.726 0.467946
## RIGGED_SYSTEM_3_2016 0.049357 0.107352 0.460 0.645680
## RIGGED_SYSTEM_4_2016 -0.074694 0.087583 -0.853 0.393749
## RIGGED_SYSTEM_5_2016 0.190252 0.096454 1.972 0.048556 *
## RIGGED_SYSTEM_6_2016 -0.005881 0.101381 -0.058 0.953740
## track_2016 0.241075 0.121467 1.985 0.047178 *
## persfinretro_2016 -0.040229 0.106714 -0.377 0.706191
## econtrend_2016 -0.295370 0.087224 -3.386 0.000708 ***
## Americatrend_2016 -0.105213 0.080845 -1.301 0.193116
## futuretrend_2016 0.210568 0.071201 2.957 0.003103 **
## wealth_2016 -0.069405 0.026344 -2.635 0.008424 **
## values_culture_2016 -0.041402 0.038670 -1.071 0.284332
## US_respect_2016 -0.068200 0.043785 -1.558 0.119322
## trustgovt_2016 0.315354 0.166655 1.892 0.058456 .
## trust_people_2016 0.040423 0.041518 0.974 0.330236
## helpful_people_2016 -0.037513 0.035353 -1.061 0.288643
## fair_people_2016 -0.017081 0.030170 -0.566 0.571294
## imiss_a_2016 0.397121 0.138987 2.857 0.004273 **
## imiss_b_2016 -0.250803 0.155454 -1.613 0.106667
## imiss_c_2016 0.017536 0.090647 0.193 0.846606
## imiss_d_2016 0.043510 0.122118 0.356 0.721619
## imiss_e_2016 -0.095552 0.078603 -1.216 0.224126
## imiss_f_2016 -0.323280 0.105432 -3.066 0.002168 **
## imiss_g_2016 -0.332034 0.078673 -4.220 2.44e-05 ***
## imiss_h_2016 -0.157298 0.107111 -1.469 0.141954
## imiss_i_2016 0.088695 0.091467 0.970 0.332196
## imiss_j_2016 0.060271 0.138429 0.435 0.663280
## imiss_k_2016 -0.181030 0.082726 -2.188 0.028646 *
## imiss_l_2016 0.274689 0.106781 2.572 0.010098 *
## imiss_m_2016 -0.124269 0.147888 -0.840 0.400746
## imiss_n_2016 -0.441612 0.090040 -4.905 9.36e-07 ***
## imiss_o_2016 0.198635 0.143160 1.388 0.165286
## imiss_p_2016 0.102987 0.105669 0.975 0.329751
## imiss_q_2016 0.244567 0.119093 2.054 0.040017 *
## imiss_r_2016 0.198839 0.121969 1.630 0.103050
## imiss_s_2016 -0.067310 0.134465 -0.501 0.616666
## imiss_t_2016 -0.116757 0.068143 -1.713 0.086639 .
## imiss_u_2016 0.022902 0.097312 0.235 0.813939
## imiss_x_2016 -0.017789 0.097349 -0.183 0.855003
## imiss_y_2016 0.150205 0.094536 1.589 0.112092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2224.3 on 6691 degrees of freedom
## Residual deviance: 2004.4 on 6650 degrees of freedom
## AIC: 2088.4
##
## Number of Fisher Scoring iterations: 6
# 80/20 split, stratified on the turnout outcome
set.seed(1234)
in_train <- caret::createDataPartition(voters_select$turnout16_2016, p = 0.8, list = FALSE)
training <- voters_select[in_train, ]
testing <- voters_select[-in_train, ]
# Logistic regression with up-sampling but NO resampling (method = "none"):
# a single fit, so no resampled performance estimates are produced
vote_glm_1 <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = training,
trControl = trainControl(method = "none", sampling = "up")
)
# Print vote_glm_1
vote_glm_1
## Generalized Linear Model
##
## 5355 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: None
## Addtional sampling using up-sampling
# Random 10% subsample of the training rows (row order preserved) to keep
# the cross-validated fits fast. NOTE(review): prefer seq_len(nrow(training))
# over 1:nrow(training) as a general safeguard.
useSmall <- sort(sample(1:nrow(training), round(0.1*nrow(training)), replace=FALSE))
trainSmall <- training[useSmall, ]
# Logistic regression with repeated cross-validation (caret's default of
# 10 folds, repeated twice — confirmed in the printed output) and
# up-sampling applied within each fold
vote_glm <- caret::train(turnout16_2016 ~ ., method = "glm", family = "binomial", data = trainSmall,
trControl = trainControl(method = "repeatedcv", repeats = 2, sampling = "up")
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print vote_glm — the glm.fit warnings above (fitted probabilities 0 or 1)
# suggest quasi-separation in some folds of the small up-sampled resamples;
# worth confirming which predictors drive it
vote_glm
## Generalized Linear Model
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 482, 482, 482, 483, 482, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results:
##
## Accuracy Kappa
## 0.8713138 0.04298445
# Random forest on the same 10% subsample, with the same repeated-CV and
# in-fold up-sampling scheme as the glm for a like-for-like comparison
vote_rf <- caret::train(turnout16_2016 ~ ., method = "rf", data = trainSmall,
trControl = trainControl(method="repeatedcv", repeats=2, sampling = "up")
)
# Print vote_rf
vote_rf
## Random Forest
##
## 536 samples
## 41 predictor
## 2 classes: 'Did not vote', 'Voted'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 2 times)
## Summary of sample sizes: 483, 482, 483, 482, 483, 483, ...
## Addtional sampling using up-sampling
##
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9674179 -0.001265823
## 21 0.9627184 -0.006073829
## 41 0.9542628 0.019107234
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Confusion matrix for the logistic regression model on its own TRAINING
# subsample — expect optimistic numbers relative to the test set
caret::confusionMatrix(predict(vote_glm, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 48
## Voted 0 471
##
## Accuracy : 0.9104
## 95% CI : (0.883, 0.9332)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3836
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.00000
## Specificity : 0.90751
## Pos Pred Value : 0.26154
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.12127
## Balanced Accuracy : 0.95376
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for the random forest model on its own TRAINING
# subsample — the perfect accuracy in the output is classic overfitting
# on training data, not generalization
caret::confusionMatrix(predict(vote_rf, trainSmall), trainSmall$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 17 0
## Voted 0 519
##
## Accuracy : 1
## 95% CI : (0.9931, 1)
## No Information Rate : 0.9683
## P-Value [Acc > NIR] : 3.143e-08
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.00000
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 1.00000
## Prevalence : 0.03172
## Detection Rate : 0.03172
## Detection Prevalence : 0.03172
## Balanced Accuracy : 1.00000
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for the logistic regression model on held-out TESTING
# data — the honest performance estimate
caret::confusionMatrix(predict(vote_glm, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 14 166
## Voted 38 1119
##
## Accuracy : 0.8474
## 95% CI : (0.827, 0.8663)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0642
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.26923
## Specificity : 0.87082
## Pos Pred Value : 0.07778
## Neg Pred Value : 0.96716
## Prevalence : 0.03889
## Detection Rate : 0.01047
## Detection Prevalence : 0.13463
## Balanced Accuracy : 0.57002
##
## 'Positive' Class : Did not vote
##
# Confusion matrix for the random forest model on held-out TESTING data —
# the output shows it predicts almost everyone as "Voted" (sensitivity for
# non-voters near zero), despite high raw accuracy
caret::confusionMatrix(predict(vote_rf, testing), testing$turnout16_2016)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did not vote Voted
## Did not vote 1 1
## Voted 51 1284
##
## Accuracy : 0.9611
## 95% CI : (0.9493, 0.9708)
## No Information Rate : 0.9611
## P-Value [Acc > NIR] : 0.5368
##
## Kappa : 0.0343
## Mcnemar's Test P-Value : 1.083e-11
##
## Sensitivity : 0.0192308
## Specificity : 0.9992218
## Pos Pred Value : 0.5000000
## Neg Pred Value : 0.9617978
## Prevalence : 0.0388930
## Detection Rate : 0.0007479
## Detection Prevalence : 0.0014959
## Balanced Accuracy : 0.5092263
##
## 'Positive' Class : Did not vote
##
Chapter 4 - Nuns
Catholic sisters survey from 1967 - https://curate.nd.edu/show/0r967368551 with codebook at https://curate.nd.edu/downloads/0v838051f6x
Exploratory data analysis with tidy data:
Predicting age with supervised learning:
Wrap up:
Example code includes:
# Load the 1967 Catholic sisters survey; every column parses as an integer
# (age, a sister ID, and Likert-scale items v116-v180)
sisters67 <- readr::read_csv("./RInputFiles/sisters.csv")
## Parsed with column specification:
## cols(
## .default = col_integer()
## )
## See spec(...) for full column specifications.
# Compact structure listing (attributes suppressed)
str(sisters67, give.attr = FALSE)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19278 obs. of 67 variables:
## $ age : int 40 30 40 30 40 30 70 30 60 80 ...
## $ sister: int 11545 16953 73323 75339 36303 95318 22474 114526 20707 91062 ...
## $ v116 : int 5 4 2 4 4 2 4 4 4 5 ...
## $ v117 : int 2 1 2 3 2 4 5 1 5 1 ...
## $ v118 : int 2 4 5 3 3 5 5 4 5 2 ...
## $ v119 : int 2 4 5 4 5 5 5 5 4 1 ...
## $ v120 : int 4 1 3 3 1 1 5 1 5 2 ...
## $ v121 : int 4 1 4 4 4 5 4 1 5 5 ...
## $ v122 : int 4 1 2 2 4 1 1 2 2 1 ...
## $ v123 : int 5 5 3 4 4 3 1 5 2 5 ...
## $ v124 : int 1 1 5 2 3 1 5 3 5 4 ...
## $ v125 : int 4 2 5 3 4 2 5 2 5 5 ...
## $ v126 : int 2 1 1 3 1 1 5 1 5 2 ...
## $ v127 : int 1 4 5 2 2 1 1 1 4 1 ...
## $ v128 : int 2 1 4 3 4 4 5 2 5 3 ...
## $ v129 : int 4 4 5 4 5 4 5 5 4 1 ...
## $ v130 : int 2 4 4 3 3 1 5 1 5 4 ...
## $ v131 : int 1 2 2 3 5 5 2 3 3 2 ...
## $ v132 : int 5 5 5 4 5 2 2 5 4 5 ...
## $ v133 : int 2 4 5 3 5 1 4 2 4 4 ...
## $ v134 : int 2 4 4 3 4 4 1 4 4 2 ...
## $ v135 : int 5 5 4 3 5 4 1 5 5 2 ...
## $ v136 : int 1 4 4 2 4 4 1 4 4 2 ...
## $ v137 : int 1 1 1 1 1 1 2 1 2 4 ...
## $ v138 : int 2 1 3 1 3 1 4 1 2 1 ...
## $ v139 : int 3 1 3 3 1 1 4 1 5 4 ...
## $ v140 : int 1 2 1 2 4 4 5 2 5 2 ...
## $ v141 : int 5 5 4 3 3 3 4 5 4 4 ...
## $ v142 : int 1 1 2 2 2 1 2 1 4 3 ...
## $ v143 : int 2 1 5 4 4 5 4 5 4 1 ...
## $ v144 : int 1 2 1 2 1 1 3 1 4 2 ...
## $ v145 : int 4 4 5 3 4 1 5 2 5 4 ...
## $ v146 : int 4 4 5 4 5 5 4 5 2 4 ...
## $ v147 : int 2 2 1 2 3 1 2 1 2 2 ...
## $ v148 : int 1 1 4 1 1 4 4 1 5 1 ...
## $ v149 : int 4 2 4 2 1 1 2 1 5 4 ...
## $ v150 : int 2 1 2 3 1 4 2 1 5 2 ...
## $ v151 : int 4 1 5 4 4 1 5 1 4 3 ...
## $ v152 : int 2 1 1 3 1 1 2 1 4 4 ...
## $ v153 : int 5 5 5 5 5 5 5 5 5 2 ...
## $ v154 : int 1 1 4 2 1 3 5 1 4 2 ...
## $ v155 : int 5 4 4 3 5 5 4 5 4 4 ...
## $ v156 : int 1 1 2 2 1 1 5 1 5 2 ...
## $ v157 : int 4 1 4 3 1 1 2 1 3 4 ...
## $ v158 : int 4 4 5 2 5 5 2 5 5 4 ...
## $ v159 : int 1 4 4 1 2 1 4 1 4 2 ...
## $ v160 : int 2 5 5 4 4 4 5 5 5 4 ...
## $ v161 : int 2 4 3 3 1 1 4 1 2 4 ...
## $ v162 : int 5 4 5 4 4 4 5 5 5 4 ...
## $ v163 : int 2 1 2 3 1 1 2 1 4 1 ...
## $ v164 : int 4 1 5 2 4 1 5 1 5 4 ...
## $ v165 : int 2 1 3 2 1 1 1 1 2 2 ...
## $ v166 : int 2 4 5 2 1 1 5 2 5 4 ...
## $ v167 : int 2 4 5 3 4 4 2 4 5 2 ...
## $ v168 : int 5 5 5 4 5 5 5 5 4 5 ...
## $ v169 : int 1 1 1 2 1 1 5 1 4 4 ...
## $ v170 : int 5 1 4 3 2 4 4 1 2 4 ...
## $ v171 : int 5 5 5 4 1 2 5 5 5 5 ...
## $ v172 : int 2 1 5 5 2 2 5 1 5 3 ...
## $ v173 : int 2 2 4 2 2 1 4 1 1 4 ...
## $ v174 : int 2 4 2 3 4 1 5 5 4 2 ...
## $ v175 : int 1 1 4 2 2 1 2 1 5 4 ...
## $ v176 : int 4 4 4 3 1 4 4 3 3 2 ...
## $ v177 : int 4 4 5 3 4 2 4 4 4 4 ...
## $ v178 : int 4 1 4 2 1 1 2 1 4 4 ...
## $ v179 : int 4 4 4 3 4 2 4 4 5 4 ...
## $ v180 : int 4 2 5 3 3 1 1 1 1 2 ...
# View sisters67: glimpse() prints every column with a preview of its values
glimpse(sisters67)
## Observations: 19,278
## Variables: 67
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80,...
## $ sister <int> 11545, 16953, 73323, 75339, 36303, 95318, 22474, 114526...
## $ v116 <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4...
## $ v117 <int> 2, 1, 2, 3, 2, 4, 5, 1, 5, 1, 3, 2, 5, 4, 1, 1, 1, 1, 2...
## $ v118 <int> 2, 4, 5, 3, 3, 5, 5, 4, 5, 2, 4, 4, 4, 5, 2, 4, 4, 4, 2...
## $ v119 <int> 2, 4, 5, 4, 5, 5, 5, 5, 4, 1, 4, 5, 3, 4, 5, 5, 5, 5, 4...
## $ v120 <int> 4, 1, 3, 3, 1, 1, 5, 1, 5, 2, 3, 1, 5, 4, 4, 1, 1, 1, 2...
## $ v121 <int> 4, 1, 4, 4, 4, 5, 4, 1, 5, 5, 4, 1, 3, 4, 3, 2, 5, 3, 3...
## $ v122 <int> 4, 1, 2, 2, 4, 1, 1, 2, 2, 1, 4, 5, 1, 2, 4, 2, 1, 4, 2...
## $ v123 <int> 5, 5, 3, 4, 4, 3, 1, 5, 2, 5, 3, 4, 3, 4, 5, 5, 4, 5, 4...
## $ v124 <int> 1, 1, 5, 2, 3, 1, 5, 3, 5, 4, 4, 1, 3, 2, 1, 1, 3, 2, 2...
## $ v125 <int> 4, 2, 5, 3, 4, 2, 5, 2, 5, 5, 5, 5, 5, 5, 1, 1, 5, 1, 2...
## $ v126 <int> 2, 1, 1, 3, 1, 1, 5, 1, 5, 2, 4, 1, 5, 1, 3, 1, 5, 1, 2...
## $ v127 <int> 1, 4, 5, 2, 2, 1, 1, 1, 4, 1, 4, 1, 3, 5, 2, 1, 1, 2, 2...
## $ v128 <int> 2, 1, 4, 3, 4, 4, 5, 2, 5, 3, 2, 5, 5, 4, 1, 1, 4, 1, 1...
## $ v129 <int> 4, 4, 5, 4, 5, 4, 5, 5, 4, 1, 5, 1, 5, 5, 5, 1, 5, 5, 5...
## $ v130 <int> 2, 4, 4, 3, 3, 1, 5, 1, 5, 4, 5, 5, 1, 4, 1, 1, 4, 3, 2...
## $ v131 <int> 1, 2, 2, 3, 5, 5, 2, 3, 3, 2, 3, 4, 3, 4, 2, 4, 3, 4, 4...
## $ v132 <int> 5, 5, 5, 4, 5, 2, 2, 5, 4, 5, 4, 5, 5, 5, 4, 5, 3, 5, 5...
## $ v133 <int> 2, 4, 5, 3, 5, 1, 4, 2, 4, 4, 5, 1, 1, 1, 2, 4, 3, 1, 2...
## $ v134 <int> 2, 4, 4, 3, 4, 4, 1, 4, 4, 2, 3, 5, 2, 4, 4, 4, 3, 3, 4...
## $ v135 <int> 5, 5, 4, 3, 5, 4, 1, 5, 5, 2, 4, 5, 3, 5, 2, 5, 3, 5, 5...
## $ v136 <int> 1, 4, 4, 2, 4, 4, 1, 4, 4, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2...
## $ v137 <int> 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 5, 1, 3, 1, 1, 1, 1, 1, 1...
## $ v138 <int> 2, 1, 3, 1, 3, 1, 4, 1, 2, 1, 3, 2, 1, 3, 2, 1, 4, 3, 1...
## $ v139 <int> 3, 1, 3, 3, 1, 1, 4, 1, 5, 4, 4, 1, 2, 4, 1, 1, 2, 1, 1...
## $ v140 <int> 1, 2, 1, 2, 4, 4, 5, 2, 5, 2, 2, 1, 5, 2, 1, 4, 1, 2, 2...
## $ v141 <int> 5, 5, 4, 3, 3, 3, 4, 5, 4, 4, 5, 5, 5, 5, 5, 4, 4, 3, 5...
## $ v142 <int> 1, 1, 2, 2, 2, 1, 2, 1, 4, 3, 4, 2, 2, 3, 2, 2, 1, 3, 1...
## $ v143 <int> 2, 1, 5, 4, 4, 5, 4, 5, 4, 1, 4, 5, 5, 2, 5, 5, 3, 3, 5...
## $ v144 <int> 1, 2, 1, 2, 1, 1, 3, 1, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v145 <int> 4, 4, 5, 3, 4, 1, 5, 2, 5, 4, 4, 1, 4, 5, 2, 2, 1, 2, 2...
## $ v146 <int> 4, 4, 5, 4, 5, 5, 4, 5, 2, 4, 4, 4, 4, 4, 2, 5, 3, 5, 4...
## $ v147 <int> 2, 2, 1, 2, 3, 1, 2, 1, 2, 2, 3, 1, 2, 1, 2, 2, 3, 2, 4...
## $ v148 <int> 1, 1, 4, 1, 1, 4, 4, 1, 5, 1, 4, 1, 3, 1, 1, 1, 2, 1, 1...
## $ v149 <int> 4, 2, 4, 2, 1, 1, 2, 1, 5, 4, 4, 2, 5, 1, 1, 2, 5, 2, 1...
## $ v150 <int> 2, 1, 2, 3, 1, 4, 2, 1, 5, 2, 5, 2, 2, 2, 3, 1, 5, 1, 1...
## $ v151 <int> 4, 1, 5, 4, 4, 1, 5, 1, 4, 3, 4, 1, 2, 5, 2, 4, 5, 1, 4...
## $ v152 <int> 2, 1, 1, 3, 1, 1, 2, 1, 4, 4, 4, 1, 4, 3, 4, 1, 1, 1, 2...
## $ v153 <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 4, 5, 5, 5, 4...
## $ v154 <int> 1, 1, 4, 2, 1, 3, 5, 1, 4, 2, 5, 1, 5, 5, 1, 1, 4, 1, 1...
## $ v155 <int> 5, 4, 4, 3, 5, 5, 4, 5, 4, 4, 3, 4, 3, 5, 2, 5, 5, 5, 1...
## $ v156 <int> 1, 1, 2, 2, 1, 1, 5, 1, 5, 2, 5, 1, 1, 4, 1, 1, 3, 1, 1...
## $ v157 <int> 4, 1, 4, 3, 1, 1, 2, 1, 3, 4, 2, 1, 2, 3, 3, 2, 3, 1, 1...
## $ v158 <int> 4, 4, 5, 2, 5, 5, 2, 5, 5, 4, 4, 5, 4, 2, 5, 4, 4, 3, 4...
## $ v159 <int> 1, 4, 4, 1, 2, 1, 4, 1, 4, 2, 4, 1, 3, 2, 1, 1, 2, 1, 1...
## $ v160 <int> 2, 5, 5, 4, 4, 4, 5, 5, 5, 4, 5, 2, 5, 5, 5, 4, 5, 2, 4...
## $ v161 <int> 2, 4, 3, 3, 1, 1, 4, 1, 2, 4, 5, 1, 4, 5, 1, 1, 3, 1, 1...
## $ v162 <int> 5, 4, 5, 4, 4, 4, 5, 5, 5, 4, 4, 5, 5, 5, 3, 4, 5, 5, 5...
## $ v163 <int> 2, 1, 2, 3, 1, 1, 2, 1, 4, 1, 4, 1, 1, 1, 1, 2, 3, 3, 1...
## $ v164 <int> 4, 1, 5, 2, 4, 1, 5, 1, 5, 4, 4, 1, 1, 5, 1, 4, 3, 1, 4...
## $ v165 <int> 2, 1, 3, 2, 1, 1, 1, 1, 2, 2, 5, 2, 1, 5, 2, 3, 3, 2, 4...
## $ v166 <int> 2, 4, 5, 2, 1, 1, 5, 2, 5, 4, 5, 1, 2, 4, 2, 4, 5, 3, 4...
## $ v167 <int> 2, 4, 5, 3, 4, 4, 2, 4, 5, 2, 4, 4, 2, 5, 2, 4, 3, 2, 4...
## $ v168 <int> 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 5, 4, 5, 5, 3, 4, 3, 4, 5...
## $ v169 <int> 1, 1, 1, 2, 1, 1, 5, 1, 4, 4, 5, 1, 1, 1, 1, 1, 1, 1, 1...
## $ v170 <int> 5, 1, 4, 3, 2, 4, 4, 1, 2, 4, 3, 3, 3, 5, 4, 3, 5, 3, 4...
## $ v171 <int> 5, 5, 5, 4, 1, 2, 5, 5, 5, 5, 5, 1, 5, 5, 3, 4, 5, 4, 5...
## $ v172 <int> 2, 1, 5, 5, 2, 2, 5, 1, 5, 3, 5, 1, 5, 5, 2, 2, 3, 5, 2...
## $ v173 <int> 2, 2, 4, 2, 2, 1, 4, 1, 1, 4, 4, 1, 2, 5, 4, 4, 3, 1, 4...
## $ v174 <int> 2, 4, 2, 3, 4, 1, 5, 5, 4, 2, 4, 5, 3, 4, 2, 4, 3, 3, 4...
## $ v175 <int> 1, 1, 4, 2, 2, 1, 2, 1, 5, 4, 3, 1, 2, 4, 1, 4, 3, 1, 1...
## $ v176 <int> 4, 4, 4, 3, 1, 4, 4, 3, 3, 2, 5, 5, 3, 5, 3, 1, 3, 3, 2...
## $ v177 <int> 4, 4, 5, 3, 4, 2, 4, 4, 4, 4, 5, 2, 5, 5, 3, 2, 5, 4, 4...
## $ v178 <int> 4, 1, 4, 2, 1, 1, 2, 1, 4, 4, 4, 1, 2, 4, 1, 2, 3, 1, 2...
## $ v179 <int> 4, 4, 4, 3, 4, 2, 4, 4, 5, 4, 5, 2, 5, 5, 3, 1, 5, 3, 4...
## $ v180 <int> 4, 2, 5, 3, 3, 1, 1, 1, 1, 2, 4, 2, 2, 5, 1, 1, 3, 3, 2...
# Histogram of respondent ages, binned by decade
ggplot(data = sisters67, mapping = aes(x = age)) +
  geom_histogram(binwidth = 10)
# Tidy the data set: drop the respondent id and reshape the question
# columns into key/value pairs (one row per respondent-question).
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0,
# but this document pins tidyr 0.8.0 (see header), so gather() is kept.
tidy_sisters <- sisters67 %>%
select(-sister) %>%
gather(key, value, -age)
# Print the structure of tidy_sisters (three columns: age, key, value)
glimpse(tidy_sisters)
## Observations: 1,253,070
## Variables: 3
## $ age <int> 40, 30, 40, 30, 40, 30, 70, 30, 60, 80, 90, 40, 60, 80, ...
## $ key <chr> "v116", "v116", "v116", "v116", "v116", "v116", "v116", ...
## $ value <int> 5, 4, 2, 4, 4, 2, 4, 4, 4, 5, 2, 5, 4, 4, 3, 4, 5, 3, 4,...
# Overall agreement with all questions varied by age:
# mean response per age decade (na.rm = TRUE guards against missing answers)
tidy_sisters %>%
group_by(age) %>%
summarize(value = mean(value, na.rm = TRUE))
## # A tibble: 9 x 2
## age value
## <int> <dbl>
## 1 20 2.82
## 2 30 2.81
## 3 40 2.82
## 4 50 2.95
## 5 60 3.10
## 6 70 3.25
## 7 80 3.39
## 8 90 3.55
## 9 100 3.93
# Number of responses at each agreement level, tallied across every
# question (count() groups by value and returns n per group)
tidy_sisters %>%
count(value)
## # A tibble: 5 x 2
## value n
## <int> <int>
## 1 1 326386
## 2 2 211534
## 3 3 160961
## 4 4 277062
## 5 5 277127
# Visualize agreement with age for questions v153-v170:
# mean respondent age at each response level, one facet panel per question
tidy_sisters %>%
filter(key %in% paste0("v", 153:170)) %>%
group_by(key, value) %>%
summarize(age = mean(age, na.rm = TRUE)) %>%
ggplot(aes(value, age, color = key)) +
geom_line(show.legend = FALSE) +
facet_wrap(~key, nrow = 3)
# Remove the sister column (a respondent id, not a predictor)
sisters_select <- sisters67 %>%
select(-sister)
# Build a simple linear regression of age on all remaining columns
simple_lm <- lm(age ~ .,
data = sisters_select)
# Print the summary of the model
summary(simple_lm)
##
## Call:
## lm(formula = age ~ ., data = sisters_select)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.663 -9.586 -1.207 8.991 53.286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.59542 1.07173 25.748 < 2e-16 ***
## v116 -0.69014 0.07727 -8.931 < 2e-16 ***
## v117 -0.15914 0.08869 -1.794 0.072786 .
## v118 -0.74668 0.08473 -8.813 < 2e-16 ***
## v119 -0.35314 0.08321 -4.244 2.21e-05 ***
## v120 -0.13875 0.07513 -1.847 0.064813 .
## v121 0.04265 0.07794 0.547 0.584247
## v122 0.05237 0.08086 0.648 0.517208
## v123 -0.96372 0.09061 -10.636 < 2e-16 ***
## v124 0.44543 0.08681 5.131 2.91e-07 ***
## v125 0.50420 0.07425 6.791 1.15e-11 ***
## v126 0.44358 0.08579 5.170 2.36e-07 ***
## v127 -0.04781 0.07915 -0.604 0.545810
## v128 0.04459 0.07595 0.587 0.557162
## v129 0.03044 0.07881 0.386 0.699351
## v130 0.51028 0.08064 6.328 2.54e-10 ***
## v131 -0.54431 0.08417 -6.467 1.02e-10 ***
## v132 -0.02527 0.09337 -0.271 0.786703
## v133 -0.67041 0.07563 -8.864 < 2e-16 ***
## v134 -0.12144 0.09060 -1.340 0.180130
## v135 0.45773 0.10886 4.205 2.63e-05 ***
## v136 -0.08790 0.07438 -1.182 0.237293
## v137 0.74412 0.10230 7.274 3.63e-13 ***
## v138 0.31534 0.10601 2.974 0.002939 **
## v139 1.36585 0.10514 12.990 < 2e-16 ***
## v140 -0.73675 0.07371 -9.995 < 2e-16 ***
## v141 0.50515 0.09355 5.400 6.75e-08 ***
## v142 -0.22168 0.08357 -2.653 0.007992 **
## v143 0.08320 0.08375 0.993 0.320536
## v144 1.09413 0.10870 10.066 < 2e-16 ***
## v145 -0.46821 0.08217 -5.698 1.23e-08 ***
## v146 -0.50063 0.08094 -6.185 6.32e-10 ***
## v147 -0.28499 0.09800 -2.908 0.003640 **
## v148 1.47288 0.09165 16.070 < 2e-16 ***
## v149 -0.29683 0.08562 -3.467 0.000528 ***
## v150 -0.33882 0.08396 -4.036 5.46e-05 ***
## v151 0.79497 0.08901 8.931 < 2e-16 ***
## v152 -0.02073 0.08179 -0.253 0.799906
## v153 -0.53982 0.09110 -5.925 3.17e-09 ***
## v154 0.98930 0.07843 12.614 < 2e-16 ***
## v155 0.96066 0.09897 9.706 < 2e-16 ***
## v156 1.07836 0.09176 11.752 < 2e-16 ***
## v157 0.07577 0.08249 0.918 0.358378
## v158 0.05330 0.08419 0.633 0.526696
## v159 -0.28846 0.08321 -3.467 0.000528 ***
## v160 0.28066 0.08559 3.279 0.001043 **
## v161 0.67235 0.08759 7.677 1.71e-14 ***
## v162 -0.29388 0.10063 -2.920 0.003501 **
## v163 -1.38883 0.09242 -15.027 < 2e-16 ***
## v164 -0.44411 0.07017 -6.329 2.52e-10 ***
## v165 -0.49356 0.09033 -5.464 4.71e-08 ***
## v166 0.24787 0.08329 2.976 0.002924 **
## v167 -0.06290 0.08185 -0.768 0.442236
## v168 0.33712 0.09425 3.577 0.000349 ***
## v169 1.44938 0.08634 16.786 < 2e-16 ***
## v170 1.01626 0.09083 11.189 < 2e-16 ***
## v171 0.90086 0.08359 10.777 < 2e-16 ***
## v172 0.07702 0.07176 1.073 0.283135
## v173 0.76461 0.06936 11.025 < 2e-16 ***
## v174 0.22074 0.07851 2.812 0.004934 **
## v175 0.18369 0.07930 2.316 0.020553 *
## v176 1.03334 0.08996 11.487 < 2e-16 ***
## v177 -0.07908 0.09643 -0.820 0.412153
## v178 -0.08005 0.08250 -0.970 0.331906
## v179 0.29778 0.09251 3.219 0.001289 **
## v180 0.11524 0.08566 1.345 0.178538
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.13 on 19212 degrees of freedom
## Multiple R-squared: 0.3332, Adjusted R-squared: 0.3309
## F-statistic: 147.7 on 65 and 19212 DF, p-value: < 2.2e-16
# Split the data into training (60%) and validation/test (40%) sets;
# createDataPartition() stratifies the split on the outcome, age
set.seed(1234)
in_train <- caret::createDataPartition(sisters_select$age, p = 0.6, list = FALSE)
training <- sisters_select[in_train, ]
validation_test <- sisters_select[-in_train, ]
# Split the remaining 40% evenly into validation and test sets
set.seed(1234)
in_test <- caret::createDataPartition(validation_test$age, p = 0.5, list = FALSE)
testing <- validation_test[in_test, ]
validation <- validation_test[-in_test, ]
# Fit a CART model (rpart) using caret's default bootstrap resampling
sisters_cart <- caret::train(age ~ ., method = "rpart", data = training)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Print the CART model (shows the cp tuning grid and resampled RMSE)
sisters_cart
## CART
##
## 11569 samples
## 65 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 11569, 11569, 11569, 11569, 11569, 11569, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.02304336 14.61359 0.1724244 12.00686
## 0.04935303 14.89119 0.1403800 12.41303
## 0.11481230 15.54485 0.1046127 13.19914
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.02304336.
# Draw a 500-row subsample of the training data so the slower boosted-tree
# models fit quickly; sort() keeps the sampled rows in their original order.
# seq_len() replaces the unsafe 1:nrow() idiom: if training were ever empty,
# 1:nrow(training) would silently expand to c(1, 0) instead of an empty
# sequence.
inSmall <- sample(seq_len(nrow(training)), 500, replace = FALSE)
smallSisters <- training[sort(inSmall), ]
# Fit gradient-boosted models (xgboost and gbm) on the subsample
sisters_xgb <- caret::train(age ~ ., method = "xgbTree", data = smallSisters)
sisters_gbm <- caret::train(age ~ ., method = "gbm", data = smallSisters, verbose = FALSE)
# Make predictions on the validation set with the three fitted models,
# adding one column per model alongside the true ages
modeling_results <- validation %>%
mutate(CART = predict(sisters_cart, validation),
XGB = predict(sisters_xgb, validation),
GBM = predict(sisters_gbm, validation))
# View the three prediction columns side by side
modeling_results %>%
select(CART, XGB, GBM)
## # A tibble: 3,854 x 3
## CART XGB GBM
## <dbl> <dbl> <dbl>
## 1 49.5 46.2 44.3
## 2 49.5 61.1 56.5
## 3 58.0 59.9 65.6
## 4 58.0 60.0 61.9
## 5 58.0 71.6 74.6
## 6 49.5 50.9 53.4
## 7 49.5 58.6 55.0
## 8 49.5 42.2 38.0
## 9 41.3 41.7 38.2
## 10 58.0 51.6 50.0
## # ... with 3,844 more rows
# Compare performance of the three models on the validation set
# (yardstick::metrics reports RMSE and R-squared for each)
yardstick::metrics(modeling_results, truth = age, estimate = CART)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 14.6 0.163
yardstick::metrics(modeling_results, truth = age, estimate = XGB)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.5 0.287
yardstick::metrics(modeling_results, truth = age, estimate = GBM)
## # A tibble: 1 x 2
## rmse rsq
## <dbl> <dbl>
## 1 13.6 0.286
# Calculate RMSE of the chosen model (GBM) on the held-out test set
testing %>%
mutate(prediction = predict(sisters_gbm, testing)) %>%
yardstick::rmse(truth = age, estimate = prediction)
## [1] 13.87981
Chapter 1 - Introduction to Process Analysis
Introduction and overview:
Activities as cornerstones of processes:
Components of process data:
Example code includes:
# Load the processmapR package using library
library(processmapR)
##
## Attaching package: 'processmapR'
## The following object is masked from 'package:stats':
##
## frequency
library(bupaR)
## Loading required package: edeaR
## Loading required package: eventdataR
## Loading required package: xesreadR
##
## Attaching package: 'bupaR'
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:utils':
##
## timestamp
handling <- c('Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Registration', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Triage and Assessment', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'Blood test', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'MRI SCAN', 'X-Ray', 'X-Ray', 'X-Ray', 'X-Ray', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss 
Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Discuss Results', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out', 'Check-out')
patient <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '156', '170', '172', '184', '278', '348', '420', '43', '156', '170', '172', '184', '278', '348', '420', '155', '221', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493')
employee <- c('r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r1', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r2', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r3', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r4', 'r5', 'r5', 'r5', 'r5', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r6', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7', 'r7')
handling_id <- c('43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720', '43', '155', '156', '170', '172', '184', '221', '278', '348', '420', '455', '493', '543', '655', '656', '670', '672', '684', '721', '778', '848', '920', '955', '993', '1020', '1072', '1081', '1082', '1088', '1127', '1163', '1199', '1257', '1309', '1318', '1319', '1325', '1364', '1400', '1436', '1557', '1587', '1710', '1730', '1777', '1889', '1890', '1904', '1906', '1918', '1955', '2012', '2082', '2154', '2189', '2227', '2272', '2384', '2385', '2399', '2401', '2413', '2450', '2507', '2577', '2649', '2684', '2720')
registration_type <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
rTime <- c('2017-02-19 04:38:51', '2017-06-03 10:05:28', '2017-06-03 10:05:28', '2017-06-17 15:10:30', '2017-06-17 23:00:33', '2017-06-27 07:48:22', '2017-08-03 17:05:27', '2017-09-26 20:22:49', '2017-11-24 08:28:44', '2018-02-08 03:39:21', '2018-03-14 21:04:28', '2018-04-29 04:55:10', '2017-02-19 07:28:53', '2017-06-04 06:27:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 18:29:13', '2017-06-28 00:14:50', '2017-08-04 07:22:06', '2017-09-27 22:57:03', '2017-11-24 10:33:00', '2018-02-08 17:33:12', '2018-03-15 15:12:41', '2018-04-30 19:40:22', '2017-02-20 19:59:18', '2017-06-04 15:18:50', '2017-06-18 22:51:07', '2017-06-21 02:43:27', '2017-07-01 23:55:10', '2017-09-28 22:58:23', '2017-11-25 12:06:18', '2018-02-12 09:01:38', '2017-02-21 06:49:49', '2017-06-04 23:23:28', '2017-06-19 06:44:30', '2017-06-21 11:16:30', '2017-07-02 11:16:08', '2017-09-29 07:28:10', '2017-11-25 21:54:56', '2018-02-12 19:43:42', '2017-06-05 00:12:24', '2017-08-05 08:25:17', '2018-03-17 10:30:24', '2018-05-02 07:32:45', '2017-02-21 14:50:43', '2017-06-05 14:03:19', '2017-06-05 10:26:16', '2017-06-19 22:46:10', '2017-06-22 04:39:35', '2017-07-03 01:28:49', '2017-08-05 22:06:23', '2017-09-29 19:13:51', '2017-11-26 06:52:23', '2018-02-17 02:44:58', '2018-03-18 00:20:51', '2018-05-02 18:14:11', '2017-02-24 14:58:43', '2017-06-05 15:58:53', '2017-06-05 15:58:53', '2017-06-20 03:48:37', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-08 23:17:45', '2017-09-29 21:16:01', '2017-11-27 04:56:53', '2018-02-20 09:49:29', '2018-03-18 08:12:07', '2018-05-03 00:11:10', '2017-02-19 07:28:53', '2017-06-03 14:19:00', '2017-06-03 13:23:14', '2017-06-17 16:31:58', '2017-06-18 01:07:42', '2017-06-27 12:22:51', '2017-08-03 19:25:12', '2017-09-26 22:17:18', '2017-11-24 10:33:00', '2018-02-08 06:01:38', '2018-03-15 00:34:01', '2018-04-29 07:39:14', '2017-02-19 21:58:08', '2017-06-04 14:23:26', '2017-06-04 06:27:00', '2017-06-18 04:14:55', '2017-06-19 00:40:19', '2017-06-28 12:48:20', 
'2017-08-04 21:09:17', '2017-09-28 12:00:12', '2017-11-25 00:44:30', '2018-02-09 07:05:52', '2018-03-16 04:08:03', '2018-05-01 10:37:51', '2017-02-21 03:12:26', '2017-06-04 19:35:51', '2017-06-19 03:01:11', '2017-06-21 08:02:20', '2017-07-02 07:43:48', '2017-09-29 04:58:49', '2017-11-25 18:30:43', '2018-02-12 13:57:13', '2017-02-21 09:57:05', '2017-06-05 02:46:59', '2017-06-19 11:40:53', '2017-06-21 16:09:26', '2017-07-02 16:03:16', '2017-09-29 12:44:39', '2017-11-26 02:40:30', '2018-02-12 23:53:46', '2017-06-05 04:39:38', '2017-08-05 13:56:39', '2018-03-17 14:09:40', '2018-05-02 12:24:41', '2017-02-21 17:57:58', '2017-06-05 15:58:53', '2017-06-05 14:03:19', '2017-06-20 01:44:29', '2017-06-22 08:40:55', '2017-07-03 03:39:51', '2017-08-05 23:53:27', '2017-09-29 21:16:01', '2017-11-26 09:44:37', '2018-02-17 06:17:57', '2018-03-18 03:22:17', '2018-05-02 21:17:12', '2017-02-24 16:03:49', '2017-06-05 17:22:16', '2017-06-05 17:15:30', '2017-06-20 05:36:40', '2017-06-22 10:59:58', '2017-07-03 05:00:48', '2017-08-09 00:13:39', '2017-09-29 23:42:48', '2017-11-27 06:53:23', '2018-02-20 12:04:00', '2018-03-18 10:48:34', '2018-05-03 02:11:42')
rOrder <- c(43, 155, 156, 170, 172, 184, 221, 278, 348, 420, 455, 493, 543, 655, 656, 670, 672, 684, 721, 778, 848, 920, 955, 993, 1020, 1072, 1081, 1082, 1088, 1127, 1163, 1199, 1257, 1309, 1318, 1319, 1325, 1364, 1400, 1436, 1557, 1587, 1710, 1730, 1777, 1889, 1890, 1904, 1906, 1918, 1955, 2012, 2082, 2154, 2189, 2227, 2272, 2384, 2385, 2399, 2401, 2413, 2450, 2507, 2577, 2649, 2684, 2720, 2764, 2876, 2877, 2891, 2893, 2905, 2942, 2999, 3069, 3141, 3176, 3214, 3264, 3376, 3377, 3391, 3393, 3405, 3442, 3499, 3569, 3641, 3676, 3714, 3741, 3793, 3802, 3803, 3809, 3848, 3884, 3920, 3978, 4030, 4039, 4040, 4046, 4085, 4121, 4157, 4278, 4308, 4431, 4451, 4498, 4610, 4611, 4625, 4627, 4639, 4676, 4733, 4803, 4875, 4910, 4948, 4993, 5105, 5106, 5120, 5122, 5134, 5171, 5228, 5298, 5370, 5405, 5441)
# Assemble the raw vectors into a tibble; categorical columns get explicit
# factor levels so summaries and plots print in a stable order
pFrame <- tibble(handling=factor(handling, levels=c('Blood test', 'Check-out', 'Discuss Results', 'MRI SCAN', 'Registration', 'Triage and Assessment', 'X-Ray')),
patient=patient,
employee=factor(employee, levels=c('r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7')),
handling_id=handling_id,
registration_type=factor(registration_type, levels=c("complete", "start")),
time=as.POSIXct(rTime),
.order=rOrder
)
# Promote the tibble to a bupaR event log by declaring which column plays
# each process-mining role (case, activity, activity instance, lifecycle,
# timestamp, and resource)
patients <- eventlog(pFrame,
case_id = "patient",
activity_id = "handling",
activity_instance_id = "handling_id",
lifecycle_id = "registration_type",
timestamp = "time",
resource_id = "employee")
# The function slice can be used to take a slice of cases out of the event
# data. slice(1:10) selects the first ten cases in the event log, where
# "first" is defined by the current ordering of the data.
# How many patients (cases) are there?
n_cases(patients)
## [1] 12
# Print the summary of the event log
summary(patients)
## Number of events: 136
## Number of cases: 12
## Number of traces: 2
## Number of distinct activities: 7
## Average trace length: 11.33333
##
## Start eventlog: 2017-02-19 04:38:51
## End eventlog: 2018-05-03 02:11:42
## handling patient employee handling_id
## Blood test :16 Length:136 r1:24 Length:136
## Check-out :24 Class :character r2:24 Class :character
## Discuss Results :24 Mode :character r3:16 Mode :character
## MRI SCAN :16 r4:16
## Registration :24 r5: 8
## Triage and Assessment:24 r6:24
## X-Ray : 8 r7:24
## registration_type time .order
## complete:68 Min. :2017-02-19 04:38:51 Min. : 1.00
## start :68 1st Qu.:2017-06-14 15:43:26 1st Qu.: 34.75
## Median :2017-07-03 03:39:51 Median : 68.50
## Mean :2017-09-06 11:31:32 Mean : 68.50
## 3rd Qu.:2017-11-26 14:32:41 3rd Qu.:102.25
## Max. :2018-05-03 02:11:42 Max. :136.00
##
# Show the journey of the first patient (both start and complete events
# for each activity instance)
slice(patients, 1)
## Event log consisting of:
## 12 events
## 1 traces
## 1 cases
## 6 activities
## 6 activity instances
##
## # A tibble: 12 x 7
## handling patient employee handling_id registration_type
## <fct> <chr> <fct> <chr> <fct>
## 1 Registration 43 r1 43 start
## 2 Triage and Assessment 43 r2 543 start
## 3 Blood test 43 r3 1020 start
## 4 MRI SCAN 43 r4 1257 start
## 5 Discuss Results 43 r6 1777 start
## 6 Check-out 43 r7 2272 start
## 7 Registration 43 r1 43 complete
## 8 Triage and Assessment 43 r2 543 complete
## 9 Blood test 43 r3 1020 complete
## 10 MRI SCAN 43 r4 1257 complete
## 11 Discuss Results 43 r6 1777 complete
## 12 Check-out 43 r7 2272 complete
## # ... with 2 more variables: time <dttm>, .order <int>
# How many distinct activities are there?
n_activities(patients)
## [1] 7
# What are the names of the activities?
activity_labels(patients)
## [1] Registration Triage and Assessment Blood test
## [4] MRI SCAN X-Ray Discuss Results
## [7] Check-out
## 7 Levels: Blood test Check-out Discuss Results MRI SCAN ... X-Ray
# Frequency table of activities (absolute and relative counts)
activities(patients)
## # A tibble: 7 x 3
## handling absolute_frequency relative_frequency
## <fct> <int> <dbl>
## 1 Check-out 12 0.176
## 2 Discuss Results 12 0.176
## 3 Registration 12 0.176
## 4 Triage and Assessment 12 0.176
## 5 Blood test 8 0.118
## 6 MRI SCAN 8 0.118
## 7 X-Ray 4 0.0588
# Have a look at the different traces (distinct activity sequences)
traces(patients)
## # A tibble: 2 x 3
## trace absolute_frequen~ relative_freque~
## <chr> <int> <dbl>
## 1 Registration,Triage and Assessment,B~ 8 0.667
## 2 Registration,Triage and Assessment,X~ 4 0.333
# How many distinct traces are there?
n_traces(patients)
## [1] 2
# Visualize the traces using trace_explorer; coverage=1 shows every trace
trace_explorer(patients, coverage=1)
# Draw the process map (activity flow graph with frequencies)
process_map(patients)
# Build a small insurance-claims event table: two claims, four activity
# instances; action 10024 appears twice, carrying its start and complete rows
claims <- tibble(id=c("claim1", "claim1", "claim2", "claim2", "claim2"),
action=c(10002L, 10011L, 10015L, 10024L, 10024L),
action_type=c("Check Contract", "Pay Back Decision", "Check Contract", "Pay Back Decision", "Pay Back Decision"),
date=as.Date(c("2008-01-12", "2008-03-22", "2008-01-13", "2008-03-23", "2008-04-14")),
originator=c("Assistant 1", "Manager 2", "Assistant 6", "Manager 2", "Manager 2"),
status=as.factor(c("start", "start", "start", "start", "complete"))
)
claims
## # A tibble: 5 x 6
## id action action_type date originator status
## <chr> <int> <chr> <date> <chr> <fct>
## 1 claim1 10002 Check Contract 2008-01-12 Assistant 1 start
## 2 claim1 10011 Pay Back Decision 2008-03-22 Manager 2 start
## 3 claim2 10015 Check Contract 2008-01-13 Assistant 6 start
## 4 claim2 10024 Pay Back Decision 2008-03-23 Manager 2 start
## 5 claim2 10024 Pay Back Decision 2008-04-14 Manager 2 complete
# Create the eventlog object `claims_log` from the raw `claims` tibble.
# bupaR needs six column mappings to interpret a data frame as an event log:
#  - case_id:              which process instance an event belongs to ("id")
#  - activity_id:          the activity label ("action_type")
#  - activity_instance_id: ties start/complete rows of one execution ("action")
#  - lifecycle_id:         the transition status, start/complete ("status")
#  - timestamp:            when the event happened ("date")
#  - resource_id:          who performed the event ("originator")
claims_log <- eventlog(claims,
case_id = "id",
activity_id = "action_type",
activity_instance_id = "action",
lifecycle_id = "status",
timestamp = "date",
resource_id = "originator")
# Print summary
summary(claims_log)
## Number of events: 5
## Number of cases: 2
## Number of traces: 1
## Number of distinct activities: 2
## Average trace length: 2.5
##
## Start eventlog: 2008-01-12
## End eventlog: 2008-04-14
## id action action_type
## Length:5 Length:5 Check Contract :2
## Class :character Class :character Pay Back Decision:3
## Mode :character Mode :character
##
##
##
## date originator status .order
## Min. :2008-01-12 Assistant 1:1 complete:1 Min. :1
## 1st Qu.:2008-01-13 Assistant 6:1 start :4 1st Qu.:2
## Median :2008-03-22 Manager 2 :3 Median :3
## Mean :2008-02-28 Mean :3
## 3rd Qu.:2008-03-23 3rd Qu.:4
## Max. :2008-04-14 Max. :5
# Check activity labels
activity_labels(claims_log)
## [1] Check Contract Pay Back Decision
## Levels: Check Contract Pay Back Decision
# Once you have an eventlog, you can access its complete metadata using the function mapping or the functions case_id, activity_id etc., to inspect individual identifiers.
Chapter 2 - Analysis Techniques
Organizational analysis:
Structuredness:
Performance analysis:
Linking perspectives:
Example code includes:
data(sepsis, package="eventdataR")
str(sepsis)
## Classes 'eventlog', 'tbl_df', 'tbl' and 'data.frame': 15214 obs. of 34 variables:
## $ case_id : chr "A" "A" "A" "A" ...
## $ activity : Factor w/ 16 levels "Admission IC",..: 4 10 3 9 6 5 8 7 2 3 ...
## $ lifecycle : Factor w/ 1 level "complete": 1 1 1 1 1 1 1 1 1 1 ...
## $ resource : Factor w/ 26 levels "?","A","B","C",..: 2 3 3 3 4 2 2 2 5 3 ...
## $ timestamp : POSIXct, format: "2014-10-22 11:15:41" "2014-10-22 11:27:00" ...
## $ age : int 85 NA NA NA NA NA NA NA NA NA ...
## $ crp : num NA NA 210 NA NA NA NA NA NA 1090 ...
## $ diagnose : chr "A" NA NA NA ...
## $ diagnosticartastrup : chr "true" NA NA NA ...
## $ diagnosticblood : chr "true" NA NA NA ...
## $ diagnosticecg : chr "true" NA NA NA ...
## $ diagnosticic : chr "true" NA NA NA ...
## $ diagnosticlacticacid : chr "true" NA NA NA ...
## $ diagnosticliquor : chr "false" NA NA NA ...
## $ diagnosticother : chr "false" NA NA NA ...
## $ diagnosticsputum : chr "false" NA NA NA ...
## $ diagnosticurinaryculture : chr "true" NA NA NA ...
## $ diagnosticurinarysediment: chr "true" NA NA NA ...
## $ diagnosticxthorax : chr "true" NA NA NA ...
## $ disfuncorg : chr "true" NA NA NA ...
## $ hypotensie : chr "true" NA NA NA ...
## $ hypoxie : chr "false" NA NA NA ...
## $ infectionsuspected : chr "true" NA NA NA ...
## $ infusion : chr "true" NA NA NA ...
## $ lacticacid : chr NA NA NA "2.2" ...
## $ leucocytes : chr NA "9.6" NA NA ...
## $ oligurie : chr "false" NA NA NA ...
## $ sirscritheartrate : chr "true" NA NA NA ...
## $ sirscritleucos : chr "false" NA NA NA ...
## $ sirscrittachypnea : chr "true" NA NA NA ...
## $ sirscrittemperature : chr "true" NA NA NA ...
## $ sirscriteria2ormore : chr "true" NA NA NA ...
## $ activity_instance_id : chr "1" "2" "3" "4" ...
## $ .order : int 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "case_id")= chr "case_id"
## - attr(*, "activity_id")= chr "activity"
## - attr(*, "activity_instance_id")= chr "activity_instance_id"
## - attr(*, "lifecycle_id")= chr "lifecycle"
## - attr(*, "resource_id")= chr "resource"
## - attr(*, "timestamp")= chr "timestamp"
# Print list of resources
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# Number of resources per activity
resource_frequency(sepsis, level = "activity")
## # A tibble: 16 x 11
## activity nr_of_resources min q1 mean median q3 max st_dev
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 Admissi~ 4 1 7.00e0 2.92e1 3.10e1 5.32e1 54 28.2
## 2 Admissi~ 20 1 1.70e1 5.91e1 4.05e1 6.82e1 216 62.7
## 3 CRP 1 3262 3.26e3 3.26e3 3.26e3 3.26e3 3262 NA
## 4 ER Regi~ 2 65 2.95e2 5.25e2 5.25e2 7.55e2 985 651
## 5 ER Seps~ 2 65 2.95e2 5.24e2 5.24e2 7.54e2 984 650
## 6 ER Tria~ 1 1053 1.05e3 1.05e3 1.05e3 1.05e3 1053 NA
## 7 IV Anti~ 2 45 2.28e2 4.12e2 4.12e2 5.95e2 778 518
## 8 IV Liqu~ 2 38 2.07e2 3.76e2 3.76e2 5.46e2 715 479
## 9 LacticA~ 1 1466 1.47e3 1.47e3 1.47e3 1.47e3 1466 NA
## 10 Leucocy~ 1 3383 3.38e3 3.38e3 3.38e3 3.38e3 3383 NA
## 11 Release~ 1 671 6.71e2 6.71e2 6.71e2 6.71e2 671 NA
## 12 Release~ 1 56 5.60e1 5.60e1 5.60e1 5.60e1 56 NA
## 13 Release~ 1 25 2.50e1 2.50e1 2.50e1 2.50e1 25 NA
## 14 Release~ 1 24 2.40e1 2.40e1 2.40e1 2.40e1 24 NA
## 15 Release~ 1 6 6.00e0 6.00e0 6.00e0 6.00e0 6 NA
## 16 Return ~ 1 294 2.94e2 2.94e2 2.94e2 2.94e2 294 NA
## # ... with 2 more variables: iqr <dbl>, total <int>
# Plot Number of executions per resource-activity
resource_frequency(sepsis, level = "resource-activity") %>% plot()
# Calculate resource involvement
resource_involvement(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 C 1050 1.00
## 2 B 1013 0.965
## 3 A 985 0.938
## 4 E 782 0.745
## 5 ? 294 0.280
## 6 F 200 0.190
## 7 O 179 0.170
## 8 G 147 0.140
## 9 I 118 0.112
## 10 M 82 0.0781
## # ... with 16 more rows
# Show graphically
sepsis %>% resource_involvement(level = "resource") %>% plot
# Compare with resource frequency
resource_frequency(sepsis, level="resource")
## # A tibble: 26 x 3
## resource absolute relative
## <fct> <int> <dbl>
## 1 B 8111 0.533
## 2 A 3462 0.228
## 3 C 1053 0.0692
## 4 E 782 0.0514
## 5 ? 294 0.0193
## 6 F 216 0.0142
## 7 L 213 0.0140
## 8 O 186 0.0122
## 9 G 148 0.00973
## 10 I 126 0.00828
## # ... with 16 more rows
# Min, max and average number of repetitions
sepsis %>% number_of_repetitions(level = "log")
## Using default type: all
## min q1 median mean q3 max st_dev iqr
## 0.000000 0.000000 2.000000 1.640000 3.000000 5.000000 1.280461 3.000000
## attr(,"type")
## [1] "all"
# Plot repetitions per activity
sepsis %>% number_of_repetitions(level = "activity") %>% plot
## Using default type: all
# Number of repetitions per resources
sepsis %>% number_of_repetitions(level = "resource")
## Using default type: all
## # resource_metric [26 x 3]
## first_resource absolute relative
## <fct> <dbl> <dbl>
## 1 ? 0 0
## 2 A 0 0
## 3 B 1536 0.189
## 4 C 3.00 0.00285
## 5 D 0 0
## 6 E 0 0
## 7 F 16.0 0.0741
## 8 G 67.0 0.453
## 9 H 6.00 0.109
## 10 I 12.0 0.0952
## # ... with 16 more rows
eci <- c('21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122', '21', '21', '21', '21', '21', '21', '21', '21', '21', '31', '31', '31', '31', '31', '31', '31', '31', '31', '31', '41', '41', '41', '41', '41', '41', '41', '51', '51', '51', '51', '51', '51', '51', '61', '61', '61', '61', '61', '61', '91', '91', '91', '91', '91', '91', '101', '101', '101', '101', '101', '101', '111', '111', '111', '111', '121', '121', '121', '121', '121', '121', '121', '121', '121', '131', '131', '131', '131', '131', '131', '131', '131', '161', '161', '171', '171', '171', '171', '181', '181', '181', '181', '181', '181', '201', '201', '201', '201', '201', '201', '201', '12', '12', '12', '12', '12', '22', '22', '22', '22', '22', '22', '32', '32', '32', '32', '32', '32', '42', '42', '42', '42', '52', '52', '52', '52', '52', '82', '82', '82', '82', '82', '92', '92', '92', '92', '92', '102', '102', '102', '102', '102', '112', '112', '122', '122')
ea1 <- c('prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 
'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast')
ea2 <- c('eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareDinner', 'eatingDinner', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'prepareBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'snack', 'eatingBreakfast', 'prepareBreakfast', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'snack', 'snack', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareDinner', 'eatingDinner', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'snack', 'prepareLunch', 'eatingLunch', 'snack', 'eatingLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareLunch', 'eatingLunch', 
'snack', 'prepareBreakfast', 'eatingBreakfast', 'prepareBreakfast', 'eatingBreakfast')
eaii <- c('9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683', '9', '10', '19', '23', '24', '26', '36', '40', '41', '51', '52', '58', '60', '62', '63', '67', '69', '72', '73', '86', '87', '89', '90', '104', '105', '107', '119', '120', '128', '132', '133', '138', '139', '149', '150', '156', '159', '160', '164', '174', '175', '192', '194', '195', '198', '205', '206', '208', '211', '213', '214', '229', '236', '237', '239', '245', '251', '252', '253', '255', '259', '260', '262', '264', '271', '276', '281', '287', '292', '293', '297', '299', '310', '312', '331', '332', '336', '347', '363', '364', '374', '376', '387', '389', '434', '435', '447', '448', '450', '453', '454', '462', '463', '471', '472', '475', '483', '484', '487', '491', '492', '496', '508', '509', '512', '517', '518', '522', '536', '540', '541', '543', '562', '563', '565', '566', '572', '584', '585', '589', '590', '598', '615', '616', '618', '619', '627', '639', '640', '642', '643', '653', '665', '666', '682', '683')
elci <- c('start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'start', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 
'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete', 'complete')
ets1 <- c('2012-11-12 09:42:02', '2012-11-12 09:52:33', '2012-11-12 11:05:44', '2012-11-12 13:45:49', '2012-11-12 13:48:49', '2012-11-12 15:23:00', '2012-11-12 18:47:29', '2012-11-12 22:35:21', '2012-11-12 22:35:21', '2012-11-13 08:56:37', '2012-11-13 09:04:54', '2012-11-13 10:14:04', '2012-11-13 13:47:45', '2012-11-13 14:08:24', '2012-11-13 14:19:01', '2012-11-13 17:34:23', '2012-11-13 18:51:51', '2012-11-13 23:05:07', '2012-11-13 23:17:07', '2012-11-14 09:06:08', '2012-11-14 09:17:48', '2012-11-14 10:38:16', '2012-11-14 10:44:16', '2012-11-14 21:30:09', '2012-11-14 21:37:09', '2012-11-14 22:14:23', '2012-11-15 09:37:15', '2012-11-15 09:47:12', '2012-11-15 10:11:08', '2012-11-15 14:35:27', '2012-11-15 14:41:27', '2012-11-15 22:07:26', '2012-11-15 22:26:02', '2012-11-16 10:39:14', '2012-11-16 10:52:56', '2012-11-16 12:09:10', '2012-11-16 14:13:00', '2012-11-16 14:19:00', '2012-11-16 18:11:36', '2012-11-19 10:13:23', '2012-11-19 10:25:00', '2012-11-19 15:55:22', '2012-11-19 21:47:27', '2012-11-19 21:59:27', '2012-11-19 22:31:06', '2012-11-20 10:20:00', '2012-11-20 10:21:02', '2012-11-20 11:00:16', '2012-11-20 13:03:28', '2012-11-20 14:25:11', '2012-11-20 14:41:22', '2012-11-21 10:01:00', '2012-11-21 15:02:08', '2012-11-21 15:15:08', '2012-11-21 17:50:29', '2012-11-22 01:40:42', '2012-11-22 10:19:15', '2012-11-22 10:26:15', '2012-11-22 11:02:27', '2012-11-22 11:56:06', '2012-11-22 15:05:51', '2012-11-22 15:12:55', '2012-11-22 16:43:08', '2012-11-22 18:15:32', '2012-11-23 00:36:00', '2012-11-23 01:03:00', '2012-11-23 09:49:00', '2012-11-23 12:53:06', '2012-11-23 14:01:08', '2012-11-23 14:23:08', '2012-11-23 16:57:24', '2012-11-23 17:58:00', '2012-11-26 09:06:12', '2012-11-26 09:57:12', '2012-11-27 10:20:26', '2012-11-27 10:30:50')
ets2 <- c('2012-11-27 11:54:15', '2012-11-27 19:46:15', '2012-11-28 09:27:15', '2012-11-28 09:34:15', '2012-11-28 12:28:02', '2012-11-28 13:16:33', '2012-11-28 19:30:08', '2012-11-28 22:15:02', '2012-11-30 10:43:19', '2012-11-30 10:46:19', '2012-11-30 14:51:36', '2012-11-30 15:08:36', '2012-11-30 17:30:40', '2012-11-30 22:12:05', '2012-11-30 22:16:07', '2011-11-28 10:38:00', '2011-11-28 10:43:00', '2011-11-28 14:31:06', '2011-11-28 14:42:00', '2011-11-28 20:20:55', '2011-11-29 12:09:09', '2011-11-29 12:11:01', '2011-11-29 13:25:29', '2011-11-29 15:15:14', '2011-11-29 15:23:00', '2011-11-29 16:32:20', '2011-11-30 10:23:46', '2011-11-30 10:28:46', '2011-11-30 13:05:27', '2011-11-30 14:39:42', '2011-11-30 14:56:00', '2011-11-30 16:41:05', '2011-11-30 14:37:00', '2011-12-01 11:17:05', '2011-12-01 11:20:05', '2011-12-01 14:29:37', '2011-12-02 12:29:08', '2011-12-02 12:32:08', '2011-12-02 14:47:18', '2011-12-02 14:51:00', '2011-12-02 19:40:44', '2011-12-05 12:15:45', '2011-12-05 12:18:05', '2011-12-05 15:00:55', '2011-12-05 15:14:00', '2011-12-05 19:24:11', '2011-12-06 11:30:19', '2011-12-06 11:33:02', '2011-12-06 14:41:16', '2011-12-06 14:56:00', '2011-12-06 19:22:50', '2011-12-07 11:12:17', '2011-12-07 11:17:22', '2011-12-07 14:04:32', '2011-12-07 14:14:00', '2011-12-07 19:23:55', '2011-12-08 11:25:12', '2011-12-08 11:29:01', '2011-12-09 11:00:13', '2011-12-09 11:03:33', '2012-11-12 09:50:02', '2012-11-12 09:55:29', '2012-11-12 12:39:42', '2012-11-12 14:48:14', '2012-11-12 14:53:14', '2012-11-12 15:31:53', '2012-11-12 19:00:56', '2012-11-12 22:37:55', '2012-11-12 22:40:55', '2012-11-13 09:00:26', '2012-11-13 09:10:12', '2012-11-13 10:51:55', '2012-11-13 14:03:31', '2012-11-13 14:18:36', '2012-11-13 14:42:36', '2012-11-13 17:36:34', '2012-11-13 19:45:03', '2012-11-13 23:15:33', '2012-11-13 23:37:33', '2012-11-14 09:09:41', '2012-11-14 09:21:43', '2012-11-14 11:43:23', '2012-11-14 11:06:23', '2012-11-14 21:35:17', '2012-11-14 21:47:18', '2012-11-14 22:17:47', '2012-11-15 
09:44:06', '2012-11-15 09:48:08', '2012-11-15 10:23:49', '2012-11-15 15:40:32', '2012-11-15 15:46:32', '2012-11-15 22:22:44', '2012-11-15 22:31:00', '2012-11-16 10:42:13')
ets3 <- c('2012-11-16 10:52:58', '2012-11-16 12:09:57', '2012-11-16 14:58:55', '2012-11-16 14:55:55', '2012-11-16 18:14:49', '2012-11-19 10:17:12', '2012-11-19 10:33:59', '2012-11-19 16:07:49', '2012-11-19 21:59:01', '2012-11-19 22:24:58', '2012-11-19 22:31:59', '2012-11-20 10:21:02', '2012-11-20 10:37:51', '2012-11-20 11:14:44', '2012-11-20 13:28:35', '2012-11-20 14:40:16', '2012-11-20 15:10:16', '2012-11-21 10:06:50', '2012-11-21 15:14:47', '2012-11-21 15:30:55', '2012-11-21 17:55:48', '2012-11-22 01:45:42', '2012-11-22 10:25:45', '2012-11-22 10:59:45', '2012-11-22 11:10:30', '2012-11-22 12:09:07', '2012-11-22 15:12:19', '2012-11-22 15:26:18', '2012-11-22 16:51:54', '2012-11-22 18:17:25', '2012-11-23 00:41:13', '2012-11-23 10:28:57', '2012-11-23 10:01:57', '2012-11-23 12:57:33', '2012-11-23 14:20:47', '2012-11-23 14:38:47', '2012-11-23 16:57:43', '2012-11-23 18:06:38', '2012-11-26 10:37:28', '2012-11-26 10:05:28', '2012-11-27 10:30:43', '2012-11-27 10:44:43', '2012-11-27 11:54:59', '2012-11-27 19:46:56', '2012-11-28 09:33:52', '2012-11-28 09:44:52', '2012-11-28 12:57:42', '2012-11-28 13:38:45', '2012-11-28 19:45:20', '2012-11-28 22:18:43', '2012-11-30 11:45:40', '2012-11-30 11:51:40', '2012-11-30 15:05:54', '2012-11-30 15:20:00', '2012-11-30 17:42:59', '2012-11-30 22:15:48', '2012-11-30 22:39:48', '2011-11-28 10:42:55', '2011-11-28 10:49:00', '2011-11-28 14:41:54', '2011-11-28 15:04:00', '2011-11-28 20:20:59', '2011-11-29 12:10:37', '2011-11-29 12:19:00', '2011-11-29 13:25:32', '2011-11-29 15:22:57', '2011-11-29 15:49:00', '2011-11-29 16:32:23', '2011-11-30 10:27:58', '2011-11-30 10:38:58', '2011-11-30 13:05:31', '2011-11-30 14:55:24', '2011-11-30 15:11:00', '2011-11-30 16:41:09', '2011-11-30 15:08:00', '2011-12-01 11:19:43', '2011-12-01 11:29:43', '2011-12-01 14:36:38', '2011-12-02 12:31:10', '2011-12-02 12:37:10', '2011-12-02 14:50:19', '2011-12-02 15:24:00', '2011-12-02 19:40:50', '2011-12-05 12:17:58', '2011-12-05 12:26:02', '2011-12-05 15:13:55', '2011-12-05 
15:42:00', '2011-12-05 19:24:16', '2011-12-06 11:32:49', '2011-12-06 11:38:51', '2011-12-06 14:55:18', '2011-12-06 15:18:18', '2011-12-06 19:22:55', '2011-12-07 11:17:14', '2011-12-07 11:22:35', '2011-12-07 14:13:34', '2011-12-07 14:41:00', '2011-12-07 20:38:18', '2011-12-08 11:28:24', '2011-12-08 11:35:55', '2011-12-09 11:03:09', '2011-12-09 11:09:08')
etsF <- c(ets1, ets2, ets3)
# Assemble the eating-pattern event data from the vectors defined above.
# Each case appears twice in `eci` (once per lifecycle transition), matching
# the "start" then "complete" halves of `elci`; ea1/ea2 are concatenated to
# supply the activity label for each of those halves.
eatData <- tibble(case_id=eci,
activity=factor(c(ea1, ea2)),
activity_instance_id=eaii,
lifecycle_id=factor(elci),
resource=factor("UNDEFINED"),  # no resource recorded; single placeholder level
timestamp=as.POSIXct(etsF)     # parsed from "YYYY-MM-DD HH:MM:SS" strings
)
# Promote the tibble to a bupaR eventlog by naming which column plays
# each of the six required roles.
eat_patterns <- eventlog(eatData,
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "activity_instance_id",
lifecycle_id = "lifecycle_id",
timestamp = "timestamp",
resource_id = "resource")
# Create performance map
eat_patterns %>% process_map(type = performance(FUN = median, units = "hours"))
# Inspect variation in activity durations graphically
eat_patterns %>% processing_time(level = "activity") %>% plot()
# Draw dotted chart
eat_patterns %>% dotted_chart(x = "relative_day", sort = "start_day", units = "secs")
# Time per activity
# daily_activities %>% processing_time(level = "activity") %>% plot
# Average duration of recordings
# daily_activities %>% throughput_time(level="log", units = "hours")
# Missing activities
# daily_activities %>% idle_time(level="log", units = "hours")
# Distribution throughput time
# vacancies %>% throughput_time(units="days")
# Distribution throughput time per department
# vacancies %>% group_by(vacancy_department) %>% throughput_time(units="days") %>% plot()
# Repetitions of activities
# vacancies %>% number_of_repetitions(level = "activity") %>% arrange(-relative)
Chapter 3 - Event Data Processing
Filtering cases:
Filtering events - trim, frequency, label, general attribute:
Aggregating events - Is-A and Part-of:
Enriching events - mutation (adding calculated variables):
Example code includes:
# Select top 20% of cases according to trace frequency (the "happy path":
# the most common ways a vacancy case flows through the process).
happy_path <- filter_trace_frequency(vacancies, percentage = 0.2)
# Visualize using process map, annotating nodes/edges with the absolute
# number of cases. BUG FIX: the transcript called `requency(...)`, which is
# not a function — the process_map type constructor is `frequency(...)`.
happy_path %>% process_map(type=frequency(value = "absolute_case"))
# Compute throughput time (start-to-end duration per case), in days
happy_path %>% throughput_time(units="days")
# Find no_declines: cases in which "Decline Candidate" never occurs
# (reverse=TRUE inverts the presence filter).
no_declines <- filter_activity_presence(vacancies, activities = "Decline Candidate", reverse=TRUE)
# Keep only cases in which BOTH "Send Offer" and "Offer Accepted" occur
# (method="all" requires every listed activity to be present).
# NOTE(review): the original comment here was truncated mid-sentence
# ("# What is the average number of").
first_hit <- filter_activity_presence(vacancies, activities = c("Send Offer", "Offer Accepted"), method="all")
# Create a performance map (durations on nodes/edges) for those cases
first_hit %>% process_map(type=performance())
# Compute throughput time per case (default units)
first_hit %>% throughput_time()
# Create not_refused
not_refused <- vacancies %>% filter_precedence(antecedents = "Receive Response", consequents = "Review Non Acceptance", precedence_type = "directly_follows", filter_method = "none")
# Select longest_cases
worst_cases <- not_refused %>% filter_throughput_time(interval=c(300, NA))
# Show the different traces
worst_cases %>% trace_explorer(coverage=1)
# Select activities
disapprovals <- vacancies %>% filter_activity(activities=c("Construct Offer", "Disapprove Offer", "Revise Offer","Disapprove Revision", "Restart Procedure"))
# Explore traces
disapprovals %>% trace_explorer(coverage=0.8)
# Performance map
disapprovals %>% process_map(type = performance(FUN = sum, units = "weeks"))
# Select cases
high_paid <- vacancies %>% filter(vacancy_department=="R&D", vacancy_salary_range==">100000")
# Most active resources
high_paid %>% resource_frequency(level="resource")
# Create a dotted chart
high_paid %>% dotted_chart(x="absolute", sort="start")
# Filtered dotted chart
library(lubridate)
high_paid %>% filter_time_period(interval = ymd(c("20180321","20180620")), filter_method = "trim") %>% dotted_chart(x="absolute", sort="start")
# Count activities and instances
n_activities(vacancies)
n_activity_instances(vacancies)
# Combine activities
united_vacancies <- vacancies %>%
act_unite("Disapprove Contract Offer" = c("Disapprove Offer","Disapprove Revision"),
"Approve Contract Offer" = c("Approve Offer","Approve Revision"),
"Construct Contract Offer" = c("Construct Offer","Revise Offer")
)
# Count activities and instances
n_activities(united_vacancies)
n_activity_instances(united_vacancies)
# Aggregate sub processes
aggregated_vacancies <- act_collapse(united_vacancies,
"Interviews" = c("First Interview","Second Interview","Third Interview"),
"Prepare Recruitment" = c("Publish Position","File Applications","Check References"),
"Create Offer" = c("Construct Contract Offer", "Disapprove Contract Offer", "Approve Contract Offer")
)
# Calculated number of activities and activity instances
n_activities(aggregated_vacancies)
n_activity_instances(aggregated_vacancies)
# Create performance map
aggregated_vacancies %>% process_map(type=performance())
# Add total_cost: per-case sum of activity_cost. group_by_case() scopes the
# mutate so the sum is computed within each case; NA costs are ignored.
vacancies_cost <- vacancies %>%
group_by_case() %>%
mutate(total_cost = sum(activity_cost, na.rm = TRUE))
# Add cost_impact: bucket each case into Low/Medium/High by total cost.
# BUG FIX (review): the transcript left this pipeline dangling after `%>%`,
# which made the next statement the right-hand side of a broken expression.
# Reconstructed with tercile cut-offs over all cases; the exact thresholds of
# the original exercise are not recoverable from this document — confirm.
vacancies_impact <- vacancies_cost %>%
ungroup_eventlog() %>%   # leave per-case grouping so quantiles span all cases
mutate(cost_impact = case_when(
total_cost >= quantile(total_cost, 2/3, na.rm = TRUE) ~ "High",
total_cost >= quantile(total_cost, 1/3, na.rm = TRUE) ~ "Medium",
TRUE ~ "Low"))
# Compute throughput time per impact class and plot the distributions
vacancies_impact %>% group_by(cost_impact) %>% throughput_time(units = "weeks") %>% plot()
# Create cost_profile: cross cost_impact with urgency to flag mismatches.
# case_when() is evaluated top-down; the first matching clause wins and the
# final TRUE clause is the catch-all ("Appropriate").
vacancies_profile <- vacancies_impact %>%
mutate(cost_profile = case_when(cost_impact == "High" & urgency < 7 ~ "Disproportionate",
cost_impact == "Medium" & urgency < 5 ~ "Excessive",
cost_impact == "Low" & urgency > 6 ~ "Lacking",
TRUE ~ "Appropriate"))
# Compare number of cases per profile class
vacancies_profile %>%
group_by(cost_profile) %>%
n_cases()
# Explore the "Lacking" traces (high urgency but low spend) as a process map
vacancies_profile %>%
filter(cost_profile == "Lacking") %>%
process_map()
Chapter 4 - Case Study
Preparing the event data - example includes data from Sales, Purchasing, Manufacturing, Packaging & Delivery, Accounting:
Getting to know the process:
Roles and rules:
Fast production, fast delivery:
Course recap:
Example code includes:
quotations <- readRDS("./RInputFiles/otc_quotations.RDS")
# Inspect quotations
str(quotations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 17 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ cancelled_at : chr "2017-05-22 13:28:04" NA NA NA ...
## $ cancelled_by : Factor w/ 20 levels "Amy","Andrea",..: 10 NA NA NA 8 NA NA NA NA NA ...
## $ manufactContacted_at : chr "2017-04-22 17:58:11" "2017-06-18 13:47:50" "2017-10-28 13:55:51" NA ...
## $ manufactContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA NA NA 11 14 NA NA ...
## $ received_at : chr "2017-04-16 20:34:12" "2017-06-09 11:19:31" "2017-10-14 18:55:47" "2017-09-08 13:29:05" ...
## $ received_by : Factor w/ 20 levels "Amy","Andrea",..: 2 8 8 8 8 8 10 8 2 2 ...
## $ reminded_at : chr "2017-05-14 19:06:41" NA NA NA ...
## $ reminded_by : Factor w/ 20 levels "Amy","Andrea",..: 8 NA NA NA 8 NA 8 8 NA NA ...
## $ send_at : chr "2017-05-08 14:20:30" "2017-07-02 18:50:58" "2017-11-09 11:27:11" NA ...
## $ send_by : Factor w/ 20 levels "Amy","Andrea",..: 10 2 2 NA 2 NA 2 2 NA 2 ...
## $ supplierContacted_at : chr "2017-04-29 13:43:18" "2017-06-20 12:19:31" "2017-10-26 18:06:29" NA ...
## $ supplierContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 14 11 11 NA 11 NA 11 14 NA 14 ...
## $ supplierOfferReceived_at: chr "2017-05-03 19:09:21" "2017-06-23 19:33:10" "2017-10-30 10:36:44" NA ...
## $ supplierOfferReceived_by: Factor w/ 20 levels "Amy","Andrea",..: 14 11 14 NA 14 NA 14 14 NA 14 ...
## $ warehouseContacted_at : chr "2017-04-24 19:36:10" "2017-06-15 19:30:07" "2017-10-22 17:57:26" NA ...
## $ warehouseContacted_by : Factor w/ 20 levels "Amy","Andrea",..: 11 11 11 NA 14 NA 11 14 NA 14 ...
# Create offer_history: reshape the wide quotations table (one column pair
# per activity, e.g. send_at / send_by) into long form. gather() stacks all
# columns except quotation_id into key/value pairs; separate() splits keys
# like "send_at" on the underscore into activity ("send") and info ("at").
offer_history <- quotations %>%
gather(key, value, -quotation_id) %>%
separate(key, into = c("activity", "info"))
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Recode the key variable: "at" columns hold timestamps, "by" columns hold
# the resource who performed the activity.
offer_history <- offer_history %>%
mutate(info = fct_recode(info, "timestamp" = 'at', "resource" = 'by'))
# Spread the info variable back out, yielding one row per
# quotation/activity with separate timestamp and resource columns.
offer_history <- offer_history %>%
spread(info, value)
validations <- readRDS("./RInputFiles/otc_validations.RDS")
# Inspect validations
str(validations)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1833 obs. of 4 variables:
## $ quotation_id: chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ started : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ completed : chr "2017-04-19 18:32:57" "2017-06-13 12:18:57" "2017-10-18 16:21:56" "2017-09-12 20:58:14" ...
# Create validate_history: every validation is one "Validate" activity
# instance; `action` becomes a unique activity-instance id per quotation.
validate_history <- validations %>%
mutate(
activity = "Validate",
action = paste(quotation_id, "validate", sep = "-"))
# Gather the timestamp columns: started/completed become a lifecycle
# key column plus a single timestamp value column (two rows per validation).
validate_history <- validate_history %>%
gather(lifecycle, timestamp, started, completed)
# Recode the lifecycle column of validate_history to the standard
# bupaR transition labels "start"/"complete".
# NOTE(review): gather() yields a character key column and fct_recode()
# targets factors — verify this coerces as intended in the forcats version
# in use.
validate_history <- validate_history %>%
mutate(lifecycle = fct_recode(lifecycle,
"start" = "started",
"complete" = "completed"))
# Add lifecycle and action column to offer_history: offer events are
# instantaneous, so each is a "complete" transition with a row-number-based
# unique activity-instance id.
offer_history <- offer_history %>%
mutate(
lifecycle = "complete",
action = paste(quotation_id, 1:n(), sep = "-"))
# Create sales_history by stacking the validation and offer events
sales_history <- bind_rows(validate_history, offer_history)
## Warning in bind_rows_(x, .id): binding factor and character vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# Load the prepared sales-side and order-side event histories from disk.
sales_history <- readRDS("./RInputFiles/otc_sales_history.RDS")
order_history <- readRDS("./RInputFiles/otc_order_history.RDS")
# sales_quotations <- readRDS("./RInputFiles/otc_sales_quotation.RDS")
str(sales_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 14695 obs. of 7 variables:
## $ quotation_id : chr "quo-1003" "quo-1004" "quo-1006" "quo-1008" ...
## $ resource : chr "Jonathan" "Andrea" "Katherine" "Andrea" ...
## $ activity : chr "Validate" "Validate" "Validate" "Validate" ...
## $ action : chr "quo-1003-validate" "quo-1004-validate" "quo-1006-validate" "quo-1008-validate" ...
## $ lifecycle : chr "start" "start" "start" "start" ...
## $ timestamp : chr "2017-04-17 14:59:08" "2017-06-11 13:10:45" "2017-10-16 15:59:18" "2017-09-09 17:58:39" ...
## $ sales_order_id: chr NA "order-17-56548" "order-17-56550" NA ...
str(order_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60804 obs. of 8 variables:
## $ sales_order_id: chr "order-17-56542" "order-17-56542" "order-17-56543" "order-17-56543" ...
## $ action : chr "order-17-56542-0000001" "order-17-56542-0000002" "order-17-56543-0000003" "order-17-56543-0000004" ...
## $ activity : Factor w/ 37 levels "Assemble Order",..: 24 35 24 35 24 35 24 35 24 35 ...
## $ resource : Factor w/ 20 levels "Amy","Andrea",..: 10 8 2 8 2 8 10 8 2 8 ...
## $ status : Factor w/ 2 levels "complete","start": 2 2 2 2 2 2 2 2 2 2 ...
## $ time : POSIXct, format: "2017-10-17 12:37:22" "2017-10-19 15:30:40" ...
## $ activity_cost : num NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
## $ quotation_id : chr NA NA NA NA ...
# str(sales_quotations)
# Align order_history with sales_history's schema: same column names and
# character (not factor) types, so the later bind_rows() stacks cleanly.
order_history <- order_history %>%
  rename(timestamp = time, lifecycle = status) %>%
  select(-activity_cost) %>%
  mutate(activity = as.character(activity),
         # BUG FIX: original code read as.character(activity) here, which
         # clobbered the resource column with activity names.
         resource = as.character(resource),
         lifecycle = as.character(lifecycle)
  )
# Parse the character timestamps into POSIXct so both histories share a type
sales_history <- sales_history %>%
mutate(timestamp=lubridate::as_datetime(timestamp))
# sales_history <- sales_history %>% left_join(sales_quotations)
# Stack sales-side and order-side events into one table
otc <- bind_rows(sales_history, order_history)
# Create the eventlog object (bupaR): each case is a quotation/order pair.
# NOTE(review): paste() keeps NA ids as the literal "NA" in case_id — presumably
# intentional so rows missing one id still form a case; confirm against course.
otc <- otc %>%
mutate(case_id = paste(quotation_id, sales_order_id, sep = "-")) %>%
eventlog(
case_id = "case_id",
activity_id = "activity",
activity_instance_id = "action",
timestamp = "timestamp",
resource_id = "resource",
lifecycle_id = "lifecycle"
)
# Create trace coverage graph: how much of the log the distinct traces cover
trace_coverage(otc, level="trace") %>% plot()
# Explore traces that jointly cover at least 25% of the cases
otc %>%
trace_explorer(coverage = 0.25)
# Collapse the four delivery-related activities into a single "Delivery"
# activity to simplify the process map
otc_high_level <- act_collapse(otc, "Delivery" = c(
"Handover To Deliverer",
"Order Delivered",
"Present For Collection",
"Order Fetched")
)
# Draw a process map
process_map(otc_high_level)
# Redraw the trace coverage graph
otc_high_level %>% trace_coverage(level="trace") %>% plot()
# Compute activity wise processing time; only the collapsed Delivery activity
# spans time between its start/complete events in this log
otc_high_level %>% processing_time(level="activity", units="days")
## # A tibble: 34 x 11
## activity min q1 mean median q3 max st_dev iqr total
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Packaging 0 0 0 0 0 0 0 0 0
## 2 Prepare Invoice 0 0 0 0 0 0 0 0 0
## 3 Produce Order 0 0 0 0 0 0 0 0 0
## 4 Quality Control 0 0 0 0 0 0 0 0 0
## 5 Assemble Order 0 0 0 0 0 0 0 0 0
## 6 Delivery 0.583 1.99 5.11 3.11 8.06 17.0 3.86 6.07 15452
## 7 Order Materials 0 0 0 0 0 0 0 0 0
## 8 Receive Materi~ 0 0 0 0 0 0 0 0 0
## 9 Receive Sales ~ 0 0 0 0 0 0 0 0 0
## 10 Schedule Job 0 0 0 0 0 0 0 0 0
## # ... with 24 more rows, and 1 more variable: relative_frequency <dbl>
# Plot a resource activity matrix of otc
otc %>% resource_frequency(level = "resource-activity") %>% plot()
# Create otc_selection restricted to the two "send" activities
otc_selection <- otc %>% filter_activity(activities = c("Send Quotation","Send Invoice"))
# Explore all traces (coverage = 1)
otc %>% trace_explorer(coverage=1)
# Draw a resource map (handover-of-work between resources)
otc_selection %>% resource_map()
# Create otc_returned: only cases that contain a "Return Goods" event
otc_returned <- otc %>% filter_activity_presence("Return Goods")
# Compute percentage of returned orders
n_cases(otc_returned)/n_cases(otc)
## [1] 0.2130923
# Trim cases to start at the return and visualize what happens afterwards
otc_returned %>% filter_trim(start_activities="Return Goods") %>% process_map()
# Time from order to delivery
# otc %>% filter_trim(start_activities="Receive Sales Order", end_activities="Order Delivered") %>%
# processing_time(units="days")
# Plot processing time by type
# otc %>%
# group_by(type) %>%
# throughput_time() %>%
# plot()
Chapter 1 - Hubs of the Network
Network science - includes social networks, neural networks, etc.:
Visualizing networks:
Centrality measures:
Example code includes:
# read the nodes file into the variable nodes (one row per terrorist)
nodes <- readr::read_csv("./RInputFiles/nodes.csv")
nodes
# read the ties file into the variable ties (one row per relationship, with weight)
ties <- readr::read_csv("./RInputFiles/ties.csv")
ties
library(igraph)
library(ggraph)
# make the undirected network from the data frame ties and print it
g <- graph_from_data_frame(ties, directed = FALSE, vertices = nodes)
g
# explore the set of nodes
V(g)
# print the number of nodes
vcount(g)
# explore the set of ties
E(g)
# print the number of ties
ecount(g)
# give the name "Madrid network" to the network and print the network `name` attribute
g$name <- "Madrid network"
g$name
# add node attribute id (sequential integers) and print the node `id` attribute
V(g)$id <- 1:vcount(g)
V(g)$id
# print the tie `weight` attribute
E(g)$weight
# print the network and spot the attributes
g
# visualize the network with layout Kamada-Kawai; tie transparency ~ weight
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# add an id label to nodes (repel = TRUE avoids overlapping labels)
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point() +
geom_node_text(aes(label = id), repel=TRUE)
# visualize the network with circular layout. Set tie transparency proportional to its weight
ggraph(g, layout = "in_circle") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# visualize the network with grid layout. Set tie transparency proportional to its weight
ggraph(g, layout = "grid") +
geom_edge_link(aes(alpha = weight)) +
geom_node_point()
# compute the degrees of the nodes (number of incident ties)
dgr <- degree(g)
# add the degrees to the data frame object
nodes <- mutate(nodes, degree = dgr)
# add the degrees to the network object
V(g)$degree <- dgr
# arrange the terrorists in decreasing order of degree
arrange(nodes, -degree)
# compute node strengths (sum of incident tie weights)
stg <- strength(g)
# add strength to the data frame object using mutate
nodes <- mutate(nodes, strength = stg)
# add the variable stg to the network object as strength
V(g)$strength <- stg
# arrange terrorists in decreasing order of degree, then (separately) in
# decreasing order of strength
# NOTE(review): a single combined sort was possibly intended here, i.e.
# arrange(nodes, -strength, -degree) — confirm against the course exercise
arrange(nodes, -degree)
arrange(nodes, -strength)
Chapter 2 - Weakness and strength
Tie betweenness:
Visualizing centrality measures:
The strength of weak ties:
Example code includes:
# save the inverse of tie weights as dist_weight: betweenness treats weights
# as distances, so strong ties must map to short distances
dist_weight <- 1 / E(g)$weight
# compute weighted tie betweenness
btw <- edge_betweenness(g, weights = dist_weight)
# mutate the data frame ties adding a variable betweenness using btw
ties <- mutate(ties, betweenness=btw)
# add the tie attribute betweenness to the network
E(g)$betweenness <- btw
# join ties with nodes twice to attach the names of both endpoints
ties_joined <- ties %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id"))
# select only relevant variables and save to ties_selected
ties_selected <- ties_joined %>%
select(from, to, name_from = name.x, name_to = name.y, betweenness)
# arrange named ties in decreasing order of betweenness
arrange(ties_selected, -betweenness)
# Visualize centrality measures on the network: tie transparency (alpha)
# proportional to weight, node size proportional to degree.
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = weight)) +
  geom_node_point(aes(size = degree))
# produce the same visualization but set node size proportional to strength
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = weight)) +
  geom_node_point(aes(size = strength))
# visualize the network with tie transparency proportional to betweenness
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = betweenness)) +
  geom_node_point()
# add node size proportional to degree
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = betweenness)) +
  geom_node_point(aes(size = degree))
# find median betweenness (use <- for assignment, not =)
q <- median(E(g)$betweenness)
# show only ties with betweenness larger than the median via the filter aesthetic
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(alpha = betweenness, filter = (betweenness > q))) +
  geom_node_point() +
  theme(legend.position = "none")
# Find the number and percentage of ties at each weight; weight 1 ties are
# considered "weak" in this network.
ties %>%
  group_by(weight) %>%
  summarise(number = n(), percentage = n() / nrow(.)) %>%
  arrange(-number)
# Build vector weakness containing TRUE for weak ties.
# (The comparison already yields a logical vector; the original
# ifelse(ties$weight == 1, TRUE, FALSE) was a redundant wrapper.)
weakness <- ties$weight == 1
# check that weakness contains the correct number of weak ties
sum(weakness)
# visualize the network by coloring the weak and strong ties
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(color = weakness)) +
  geom_node_point()
# visualize the network with only weak ties using the filter aesthetic
ggraph(g, layout = "with_kk") +
  geom_edge_link(aes(filter = weakness), alpha = 0.5) +
  geom_node_point()
Chapter 3 - Connection patterns
Connection patterns:
Pearson correlation coefficient:
Most similar and most dissimilar terrorists:
Example code includes:
# Make the tie list symmetric for an undirected network: duplicate every tie
# with from/to swapped so each pair appears in both directions.
ties_mutated <- mutate(ties, temp = to, to = from, from = temp) %>% select(-temp)
# append ties_mutated data frame to ties data frame
ties <- rbind(ties, ties_mutated)
# use a scatter plot to visualize node connection patterns in ties setting color aesthetic to weight
ggplot(ties, aes(x = from, y = to, color = factor(weight))) +
  geom_point() +
  labs(color = "weight")
# get the weighted adjacency matrix (dense, unnamed)
A <- as_adjacency_matrix(g, attr = "weight", sparse = FALSE, names = FALSE)
# print the first row and first column of A
A[1, ]
A[, 1]
# print submatrix of the first 6 rows and columns
A[1:6, 1:6]
# obtain a vector of node strengths (row sums of weights)
rowSums(A)
# build a Boolean (0/1) matrix from the weighted matrix A
B <- ifelse(A > 0, 1, 0)
# obtain a vector of node degrees using the Boolean matrix
rowSums(B)
# compute the Pearson correlation on columns of A: similarity of connection patterns
S <- cor(A)
# set the diagonal of S to 0 (self-similarity is not informative;
# use <- for assignment, not =)
diag(S) <- 0
# print a summary of the similarities in matrix S
summary(c(S))
# plot a histogram of similarities in matrix S
hist(c(S), xlab = "Similarity", main = "Histogram of similarity")
# Scatter plot of degree and strength with regression line
ggplot(nodes, aes(x = degree, y = strength)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Pearson correlation coefficient
cor(nodes$degree, nodes$strength)
# build weighted similarity network from the correlation matrix and save to h
h <- graph_from_adjacency_matrix(S, mode = "undirected", weighted = TRUE)
# convert the similarity network h into a similarity data frame sim_df
sim_df <- as_data_frame(h, what = "edges")
# map the similarity data frame to a tibble and save it as sim_tib
sim_tib <- as_tibble(sim_df)
# print sim_tib
sim_tib
# left join similarity and nodes data frames and then select and rename relevant variables
sim2 <- sim_tib %>%
left_join(nodes, c("from" = "id")) %>%
left_join(nodes, c("to" = "id")) %>%
select(from, to, name_from = name.x, name_to = name.y, similarity = weight,
degree_from = degree.x, degree_to = degree.y, strength_from = strength.x, strength_to = strength.y)
# print sim2
sim2
# arrange sim2 in decreasing order of similarity.
sim2 %>% arrange(-similarity)
# filter sim2, allowing only pairs with a degree of at least 10, arrange the result in decreasing order of similarity
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(-similarity)
# Repeat the previous steps, but in increasing order of similarity (most dissimilar pairs)
sim2 %>%
filter(degree_from >= 10, degree_to >= 10) %>%
arrange(similarity)
# filter the similarity data frame to similarities larger than or equal to 0.60
sim3 <- filter(sim2, similarity >= 0.6)
# build a similarity network called h2 from the filtered similarity data frame
h2 <- graph_from_data_frame(sim3, directed = FALSE)
# visualize the similarity network h2
ggraph(h2, layout = "with_kk") +
geom_edge_link(aes(alpha = similarity)) +
geom_node_point()
Chapter 4 - Similarity Clusters
Hierarchical clustering - find clusters of similar people:
Interactive visualizations with visNetwork:
Wrap up:
Example code includes:
# compute a distance matrix from the similarity matrix (distance = 1 - similarity)
D <- 1 - S
# obtain a distance object as required by hclust()
d <- as.dist(D)
# run average-linkage clustering method and plot the dendrogram
cc <- hclust(d, method = "average")
plot(cc)
# find the similarity of the first pair of nodes that have been merged
# (indices 40 and 45 taken from the dendrogram's first merge)
S[40, 45]
# cut the dendrogram at 4 clusters
cls <- cutree(cc, k = 4)
# add cluster information to the nodes data frame
nodes <- mutate(nodes, cluster = cls)
# print the nodes data frame
nodes
# output the names of terrorists in the first cluster
filter(nodes, cluster == 1) %>%
select(name)
# for each cluster select the size of the cluster, the average node degree, and the average node strength and sorts by cluster size
group_by(nodes, cluster) %>%
summarise(size = n(),
avg_degree = mean(degree),
avg_strength = mean(strength)
) %>%
arrange(-size)
# add cluster information to the network (relies on nodes/vertex order matching)
V(g)$cluster <- nodes$cluster
# visualize the original network with colored clusters
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
labs(color = "cluster")
# facet the network with respect to cluster attribute
ggraph(g, layout = "with_kk") +
geom_edge_link(aes(alpha = weight), show.legend=FALSE) +
geom_node_point(aes(color = factor(cluster))) +
facet_nodes(~cluster, scales="free") +
labs(color = "cluster")
# convert igraph to visNetwork's node/edge list representation
data <- visNetwork::toVisNetworkData(g)
# print head of nodes and ties
head(data$nodes)
head(data$edges)
# visualize the network (interactive, physics-based default layout)
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300)
# use the Kamada-Kawai layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk")
# use the circle layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_in_circle")
# use the grid layout
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_on_grid")
# highlight nearest nodes and ties of the selected node
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(highlightNearest = TRUE)
# select nodes by id
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(nodesIdSelection = TRUE)
# set color to cluster and regenerate the network data so colors are exported
V(g)$color = V(g)$cluster
data <- visNetwork::toVisNetworkData(g)
# select by group (cluster)
visNetwork::visNetwork(nodes = data$nodes, edges = data$edges, width = 300, height = 300) %>%
visNetwork::visIgraphLayout(layout = "layout_with_kk") %>%
visNetwork::visOptions(selectedBy = "group")
Chapter 1 - Introduction to Data Privacy
Intro to Anonymization - Part I:
Intro to Anonymization - Part II:
Data Synthesis:
Example code includes:
# Load the privacy example datasets (whitehouse, fertility) into the workspace
load("./RInputFiles/dataPriv.RData")
# Preview data
whitehouse
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Adam W. Employee 66300 Per Annum WESTERN REGIONAL COMMUN~
## 2 Adams, Ian H. Employee 45000 Per Annum EXECUTIVE ASSISTANT TO ~
## 3 Agnew, David P. Employee 93840 Per Annum DEPUTY DIRECTOR OF INTE~
## 4 Albino, James Employee 91800 Per Annum SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., Joseph E. Employee 130500 Per Annum SPECIAL ASSISTANT TO TH~
## 6 Alley, Hilary J. Employee 42000 Per Annum STAFF ASSISTANT
## 7 Amorsingh, Lucius L. Employee 56092 Per Annum SPECIAL ASSISTANT
## 8 Anderson, Amanda D. Employee 60000 Per Annum SPECIAL ASSISTANT TO TH~
## 9 Anderson, Charles D. Employee 51000 Per Annum POLICY ASSISTANT
## 10 Andrias, Kate E. Employee 130500 Per Annum SPECIAL ASSISTANT TO TH~
## # ... with 459 more rows
# Set seed for reproducible pseudonyms
set.seed(42)
# Replace names with random numbers from 1 to 1000 (pseudonymization;
# replace = FALSE keeps pseudonyms unique)
whitehouse_no_names <- whitehouse %>%
mutate(Name = sample(1:1000, nrow(.), replace=FALSE))
whitehouse_no_names
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 66300 Per Annum WESTERN REGIONAL COMMUNICATIONS DIRECT~
## 2 937 Employee 45000 Per Annum EXECUTIVE ASSISTANT TO THE DIRECTOR OF~
## 3 286 Employee 93840 Per Annum DEPUTY DIRECTOR OF INTERGOVERNMENTAL A~
## 4 828 Employee 91800 Per Annum SENIOR PROGRAM MANAGER
## 5 640 Employee 130500 Per Annum SPECIAL ASSISTANT TO THE PRESIDENT FOR~
## 6 517 Employee 42000 Per Annum STAFF ASSISTANT
## 7 733 Employee 56092 Per Annum SPECIAL ASSISTANT
## 8 134 Employee 60000 Per Annum SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 51000 Per Annum POLICY ASSISTANT
## 10 699 Employee 130500 Per Annum SPECIAL ASSISTANT TO THE PRESIDENT AND~
## # ... with 459 more rows
# Rounding Salary to the nearest ten thousand (round with digits = -4)
# reduces the precision of a quasi-identifier
whitehouse_no_identifiers <- whitehouse_no_names %>%
mutate(Salary = round(Salary, -4))
whitehouse_no_identifiers
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <int> <chr> <dbl> <chr> <chr>
## 1 915 Employee 70000 Per Annum WESTERN REGIONAL COMMUNICATIONS DIRECT~
## 2 937 Employee 40000 Per Annum EXECUTIVE ASSISTANT TO THE DIRECTOR OF~
## 3 286 Employee 90000 Per Annum DEPUTY DIRECTOR OF INTERGOVERNMENTAL A~
## 4 828 Employee 90000 Per Annum SENIOR PROGRAM MANAGER
## 5 640 Employee 130000 Per Annum SPECIAL ASSISTANT TO THE PRESIDENT FOR~
## 6 517 Employee 40000 Per Annum STAFF ASSISTANT
## 7 733 Employee 60000 Per Annum SPECIAL ASSISTANT
## 8 134 Employee 60000 Per Annum SPECIAL ASSISTANT TO THE CHIEF OF STAFF
## 9 652 Employee 50000 Per Annum POLICY ASSISTANT
## 10 699 Employee 130000 Per Annum SPECIAL ASSISTANT TO THE PRESIDENT AND~
## # ... with 459 more rows
# Convert the salaries into three categories (generalization):
# 0 = below 50k, 1 = 50k to under 100k, 2 = 100k and above
whitehouse.gen <- whitehouse %>%
mutate(Salary = ifelse(Salary < 50000, 0,
ifelse(Salary >= 50000 & Salary < 100000, 1, 2)))
whitehouse.gen
## # A tibble: 469 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Abrams, Adam W. Employee 1.00 Per Annum WESTERN REGIONAL COMMUN~
## 2 Adams, Ian H. Employee 0 Per Annum EXECUTIVE ASSISTANT TO ~
## 3 Agnew, David P. Employee 1.00 Per Annum DEPUTY DIRECTOR OF INTE~
## 4 Albino, James Employee 1.00 Per Annum SENIOR PROGRAM MANAGER
## 5 Aldy, Jr., Joseph E. Employee 2.00 Per Annum SPECIAL ASSISTANT TO TH~
## 6 Alley, Hilary J. Employee 0 Per Annum STAFF ASSISTANT
## 7 Amorsingh, Lucius L. Employee 1.00 Per Annum SPECIAL ASSISTANT
## 8 Anderson, Amanda D. Employee 1.00 Per Annum SPECIAL ASSISTANT TO TH~
## 9 Anderson, Charles D. Employee 1.00 Per Annum POLICY ASSISTANT
## 10 Andrias, Kate E. Employee 2.00 Per Annum SPECIAL ASSISTANT TO TH~
## # ... with 459 more rows
# Bottom Coding: raise every salary below 45000 up to the 45000 floor
whitehouse.bottom <- whitehouse %>%
mutate(Salary = pmax(Salary, 45000))
# Filter Results: the 109 rows at exactly 45000 include all bottom-coded values
whitehouse.bottom %>%
filter(Salary <= 45000)
## # A tibble: 109 x 5
## Name Status Salary Basis Title
## <chr> <chr> <dbl> <chr> <chr>
## 1 Adams, Ian H. Employee 45000 Per Annum EXECUTIVE ASSISTANT TO T~
## 2 Alley, Hilary J. Employee 45000 Per Annum STAFF ASSISTANT
## 3 Asen, Jonathan D. Employee 45000 Per Annum SENIOR ANALYST
## 4 Ayling, Lindsay A. Employee 45000 Per Annum ANALYST
## 5 Baggetto, Maude L. Employee 45000 Per Annum STAFF ASSISTANT
## 6 Bates, Andrew J. Employee 45000 Per Annum MEDIA MONITOR
## 7 Belive, Lauren E. Employee 45000 Per Annum LEGISLATIVE ASSISTANT AN~
## 8 Bisi, Rachel I. Employee 45000 Per Annum LEGISLATIVE ASSISTANT
## 9 Block, Michael R. Employee 45000 Per Annum STAFF ASSISTANT
## 10 Blount, Patricia H. Employee 45000 Per Annum RECORDS MANAGEMENT ANALY~
## # ... with 99 more rows
# View fertility data (100 participants; numeric features appear pre-scaled)
fertility
## # A tibble: 100 x 10
## Season Age Child_Disease Accident_Trauma Surgical_Interv~ High_Fevers
## <dbl> <dbl> <int> <int> <int> <int>
## 1 -0.330 0.690 0 1 1 0
## 2 -0.330 0.940 1 0 1 0
## 3 -0.330 0.500 1 0 0 0
## 4 -0.330 0.750 0 1 1 0
## 5 -0.330 0.670 1 1 0 0
## 6 -0.330 0.670 1 0 1 0
## 7 -0.330 0.670 0 0 0 -1
## 8 -0.330 1.00 1 1 1 0
## 9 1.00 0.640 0 0 1 0
## 10 1.00 0.610 1 0 0 0
## # ... with 90 more rows, and 4 more variables: Alcohol_Freq <dbl>,
## # Smoking <int>, Hours_Sitting <dbl>, Diagnosis <int>
# Number of participants with Surgical_Intervention and Diagnosis
fertility %>%
summarise_at(vars(Surgical_Intervention, Diagnosis), sum)
## # A tibble: 1 x 2
## Surgical_Intervention Diagnosis
## <int> <int>
## 1 51 12
# Mean and Standard Deviation of Age
# (funs() is deprecated in later dplyr; list(mean = mean, sd = sd) / across()
# is the modern equivalent)
fertility %>%
summarise_at(vars(Age), funs(mean, sd))
## # A tibble: 1 x 2
## mean sd
## <dbl> <dbl>
## 1 0.669 0.121
# Counts of the Groups in High_Fevers
fertility %>%
count(High_Fevers)
## # A tibble: 3 x 2
## High_Fevers n
## <int> <int>
## 1 -1 9
## 2 0 63
## 3 1 28
# Cross-tabulated counts of Child_Disease and Accident_Trauma
fertility %>%
count(Child_Disease, Accident_Trauma)
## # A tibble: 4 x 3
## Child_Disease Accident_Trauma n
## <int> <int> <int>
## 1 0 0 10
## 2 0 1 3
## 3 1 0 46
## 4 1 1 41
# Find proportions (mean of a 0/1 column is the proportion of 1s)
fertility %>%
summarise_at(vars(Accident_Trauma, Surgical_Intervention), mean)
## # A tibble: 1 x 2
## Accident_Trauma Surgical_Intervention
## <dbl> <dbl>
## 1 0.440 0.510
# Set seed for reproducible synthesis
set.seed(42)
# Generate Synthetic data by sampling Bernoulli draws at the observed proportions
accident <- rbinom(100, 1, prob=0.440)
surgical <- rbinom(100, 1, prob=0.510)
# Square root Transformation of Salary: makes the skewed salary distribution
# closer to normal before fitting a Gaussian synthesizer.
whitehouse.salary <- whitehouse %>%
  mutate(Salary = sqrt(Salary))
# Calculate the mean and standard deviation on the transformed scale
stats <- whitehouse.salary %>%
  summarize(mean(Salary), sd(Salary))
stats
## # A tibble: 1 x 2
## `mean(Salary)` `sd(Salary)`
## <dbl> <dbl>
## 1 279 71.8
# Generate Synthetic data from a normal with the fitted parameters
set.seed(42)
salary_transformed <- rnorm(nrow(whitehouse), mean = 279, sd = 71.8)
# Power transformation back to the original salary scale
# (^ is the idiomatic R power operator; the original used the ** alias)
salary_original <- salary_transformed^2
# Hard bound at zero so no synthetic salary is negative
# (equivalent to ifelse(salary_original < 0, 0, salary_original))
salary <- pmax(salary_original, 0)
Chapter 2 - Introduction to Differential Privacy
Differential Privacy - quantification of privacy loss via a privacy budget:
Global Sensitivity - usual decision-making factor for differential privacy:
Laplace Mechanism - adds noise based on the Laplace distribution with mean 0 and parameters global sensitivity and privacy budget:
Example code includes:
# Number of observations
n <- nrow(fertility)
# Global sensitivity of counts (adding/removing one record changes a count by at most 1)
gs.count <- 1
# Global sensitivity of proportions
gs.prop <- 1/n
# Lower bound of Hours_Sitting
a <- 0
# Upper bound of Hours_Sitting
b <- 1
# Global sensitivity of mean for Hours_Sitting
gs.mean <- (b - a) / n
# Global sensitivity of variance for Hours_Sitting
gs.var <- (b - a)**2 / n
# How many participants had a Surgical_Intervention?
fertility %>%
summarise_at(vars(Surgical_Intervention), sum)
## # A tibble: 1 x 1
## Surgical_Intervention
## <int>
## 1 51
# Set the seed
set.seed(42)
# Apply the Laplace mechanism: add Laplace noise with scale = sensitivity/epsilon
# (smoothmest::rdoublex draws from the double-exponential distribution)
eps <- 0.1
smoothmest::rdoublex(1, 51, 1/eps)
## [1] 52.98337
# Proportion of Accident_Trauma
stats <- fertility %>%
summarise_at(vars(Accident_Trauma), mean)
stats
## # A tibble: 1 x 1
## Accident_Trauma
## <dbl>
## 1 0.440
# Set the seed
set.seed(42)
# Apply the Laplace mechanism to the proportion (sensitivity 1/n)
eps <- 0.1
smoothmest::rdoublex(1, 0.440, (1/n)/eps)
## [1] 0.4598337
# Mean and Variance of Hours Sitting
fertility %>%
summarise_at(vars(Hours_Sitting), funs(mean, var))
## # A tibble: 1 x 2
## mean var
## <dbl> <dbl>
## 1 0.407 0.0347
# Setup
set.seed(42)
eps <- 0.1
# Laplace mechanism to mean
smoothmest::rdoublex(1, 0.41, gs.mean/eps)
## [1] 0.4298337
# Laplace mechanism to variance
smoothmest::rdoublex(1, 0.03, gs.var/eps)
## [1] 0.0583491
Chapter 3 - Differentially Private Properties
Sequential Composition - method to require that someone cannot find the real answer by just sending multiple queries:
Parallel Composition - method to account for queries to different parts of the database (no adjustment to epsilon needed):
Post-processing:
Impossible and inconsistent answers:
Example code includes:
# Set Value of Epsilon: the 0.1 budget is split across two queries
# (sequential composition), so each query gets 0.1/2
eps <- 0.1 / 2
# Number of observations
n <- nrow(fertility)
# Lower bound of Age
a <- 0
# Upper bound of Age
b <- 1
# GS of counts for Diagnosis
gs.count <- 1
# GS of mean for Age
gs.mean <- (b-a)/n
# Number of Participants with abnormal diagnosis
stats1 <- fertility %>%
summarize_at(vars(Diagnosis), sum)
stats1
## # A tibble: 1 x 1
## Diagnosis
## <int>
## 1 12
# Mean of age
stats2 <- fertility %>%
summarize_at(vars(Age), mean)
stats2
## # A tibble: 1 x 1
## Age
## <dbl>
## 1 0.669
# Set seed
set.seed(42)
# Laplace mechanism to the count of abnormal diagnosis
smoothmest::rdoublex(1, 12, gs.count/eps)
## [1] 15.96674
# Laplace mechanism to the mean of age
smoothmest::rdoublex(1, 0.67, gs.mean/eps)
## [1] 0.7266982
# Set Value of Epsilon: these queries touch disjoint subsets of the data
# (parallel composition), so the full budget applies to each
eps <- 0.1
# Mean of Age per diagnosis level
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the mean age of participants with an abnormal diagnosis
smoothmest::rdoublex(1, 0.71, gs.mean/eps)
## [1] 0.7298337
# Laplace mechanism to the mean age of participants with a normal diagnosis
smoothmest::rdoublex(1, 0.66, gs.mean/eps)
## [1] 0.6883491
# Set Value of Epsilon: 0.5 budget split across three noisy season counts
eps <- 0.5/3
# GS of Counts
gs.count <- 1
# Number of participants in each of the four seasons
# NOTE(review): the query below is a copy-paste of the earlier mean-Age-by-
# Diagnosis query and does not match this comment; the intended query is
# presumably a count by Season (yielding the 28/37/4 used below) — confirm
fertility %>%
group_by(Diagnosis) %>%
summarise_at(vars(Age), mean)
## # A tibble: 2 x 2
## Diagnosis Age
## <int> <dbl>
## 1 0 0.664
## 2 1 0.707
# Set the seed
set.seed(42)
# Laplace mechanism to the number of participants who were evaluated in the winter, spring, and summer
winter <- smoothmest::rdoublex(1, 28, gs.count / eps) %>%
round()
spring <- smoothmest::rdoublex(1, 37, gs.count / eps) %>%
round()
summer <- smoothmest::rdoublex(1, 4, gs.count / eps) %>%
round()
# Post-process based on previous queries: the fourth count is derived from the
# total, which costs no additional privacy budget
fall <- nrow(fertility) - winter - spring - summer
# Set Value of Epsilon
eps <- 0.01
# GS of counts
gs.count <- 1
# Number of Participants with Child_Disease
fertility %>%
summarise_at(vars(Child_Disease), sum)
## # A tibble: 1 x 1
## Child_Disease
## <int>
## 1 87
# Apply the Laplace mechanism (low epsilon means large noise)
set.seed(42)
lap_childhood <- smoothmest::rdoublex(1, 87, gs.count / eps) %>%
round()
# Total number of observations in fertility
max_value <- nrow(fertility)
# Bound the value such that the noisy answer does not exceed the total number
# of observations (post-processing keeps answers plausible without extra budget)
ifelse(lap_childhood > max_value, max_value, lap_childhood)
## [1] 100
# Set the seed
set.seed(42)
# Apply the Laplace mechanism to the three High_Fevers group counts,
# clamping each noisy count at zero so no count is negative
fever1 <- smoothmest::rdoublex(1, 9, gs.count/eps) %>%
max(0)
fever2 <- smoothmest::rdoublex(1, 63, gs.count/eps) %>%
max(0)
fever3 <- smoothmest::rdoublex(1, 28, gs.count/eps) %>%
max(0)
fever <- c(fever1, fever2, fever3)
# Normalize noise so the noisy counts sum to the true total number of rows
fever_normalized <- (fever/sum(fever)) * (nrow(fertility))
# Round the values
round(fever_normalized)
## [1] 24 76 0
Chapter 4 - Differentially Private Data Synthesis